microsoft/TypeAgent
Public · mirrored from https://github.com/microsoft/TypeAgent · Available
python/fineTuning/unsloth/nltkExtract.py
905 lines · mode: code
| 1 | # Copyright (c) Microsoft Corporation and Henry Lucco. |
| 2 | # Licensed under the MIT License. |
| 3 | |
| 4 | from rake_nltk import Rake |
| 5 | import sys |
| 6 | import os |
| 7 | from argparse import ArgumentParser |
| 8 | import nltk |
| 9 | from nltk import word_tokenize, pos_tag, ne_chunk |
| 10 | from nltk.tree import Tree |
| 11 | import spacy |
| 12 | import yake |
| 13 | from keybert import KeyBERT |
| 14 | |
# Command-line interface: every option has a default, so the script can be
# run without any arguments.
parser = ArgumentParser(
    description="Extract keywords from dataset using NLTK-RAKE.",
)
parser.add_argument(
    "--dataset_path",
    type=str,
    default='/data/npr_chunks_no_embedding_seed127_samples5000_test.json',
    help="Path to the dataset file.",
)
parser.add_argument(
    "--max_length",
    type=int,
    default=1,
    help="Maximum number of words in a keyword phrase.",
)
parser.add_argument(
    "--output_file",
    type=str,
    default='extraction.txt',
    help="Path to the output file.",
)
parser.add_argument(
    "--verbose",
    action='store_true',
    help="Enable verbose output (shows all extraction details). Default is non-verbose.",
)
args = parser.parse_args(sys.argv[1:])

# Expose the parsed options as module-level settings used by the rest of the script.
dataset_path, max_length, output_file, verbose = (
    args.dataset_path,
    args.max_length,
    args.output_file,
    args.verbose,
)
| 25 | |
# Initialize RAKE; max_length caps the number of words in a ranked phrase
rake = Rake(max_length=max_length)

# Initialize YAKE keyword extractor
# Parameters: language, max_ngram_size, deduplication_threshold, number of keywords
yake_extractor = yake.KeywordExtractor(lan="en", n=max_length, dedupLim=0.9, top=20)

# Initialize KeyBERT model (constructed with its default settings)
keybert_model = KeyBERT()

# Load the small English spacy model.
# Only the "ner" component is disabled here; every other component shipped
# with the model stays enabled (the code below relies on POS tags, the
# dependency parse, sentence boundaries, noun chunks and lemmas).
nlp = spacy.load("en_core_web_sm", disable=["ner"])
| 40 | |
# Load an array of JSON objects with properties like speaker and content
import json
import time

# Read the dataset as UTF-8 explicitly instead of relying on the platform's
# locale encoding; the output file further down is written as UTF-8 as well.
with open(dataset_path, encoding='utf-8') as f:
    rawData = json.load(f)
| 47 | |
def extract_keywords_rake(text):
    """Run the module-level RAKE extractor over *text*.

    Returns the extractor's ranked phrases together with their scores; the
    phrase length is already bounded by the ``max_length`` the extractor was
    configured with.
    """
    rake.extract_keywords_from_text(text)
    return rake.get_ranked_phrases_with_scores()
| 56 | |
def extract_keywords_yake(text):
    """Run the module-level YAKE extractor over *text*.

    YAKE yields ``(keyword, score)`` pairs where a lower score is better.
    Each pair is flipped to ``(score, keyword)`` so the result has the same
    layout as the RAKE output.
    """
    flipped = []
    for phrase, weight in yake_extractor.extract_keywords(text):
        flipped.append((weight, phrase))
    return flipped
| 64 | |
def extract_keywords_keybert(text):
    """Run the module-level KeyBERT model over *text*.

    KeyBERT yields ``(keyword, score)`` pairs where a higher score is better.
    Up to 20 keyphrases of 1..``max_length`` words are requested, and each
    pair is flipped to ``(score, keyword)`` to match the RAKE layout.
    """
    ranked = keybert_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, max_length),
        top_n=20,
    )
    return [(relevance, phrase) for phrase, relevance in ranked]
| 73 | |
def extract_named_entities(text):
    """Extract named entities from text using NLTK.

    The text is tokenized, POS-tagged and chunked with ``ne_chunk`` in
    non-binary mode. Every subtree of the chunked result is reported as an
    ``(entity_text, entity_type)`` pair, where the text is the subtree's
    leaf tokens joined by single spaces and the type is the subtree label.
    """
    tagged = pos_tag(word_tokenize(text))
    chunked = ne_chunk(tagged, binary=False)

    # Plain (token, tag) tuples are non-entity tokens; only Tree nodes are entities.
    return [
        (" ".join(word for word, _tag in subtree.leaves()), subtree.label())
        for subtree in chunked
        if isinstance(subtree, Tree)
    ]
| 92 | |
def extract_phrases(text):
    """Extract noun phrases and verb phrases from text using spacy.

    Returns:
        ``(noun_phrases, verb_phrases)``. Noun phrases are the texts of the
        document's noun chunks. Each VERB token contributes one verb phrase:
        the verb followed by those of its direct children whose dependency
        label is in the attached set below (or the bare verb when it has no
        such child).
    """
    attached_deps = ("dobj", "attr", "prep", "pobj", "advmod", "aux", "auxpass", "neg")
    parsed = nlp(text)

    noun_phrases = [span.text for span in parsed.noun_chunks]

    verb_phrases = []
    for tok in parsed:
        if tok.pos_ != "VERB":
            continue
        # Joining a single-element list yields the bare verb text.
        pieces = [tok.text]
        pieces.extend(kid.text for kid in tok.children if kid.dep_ in attached_deps)
        verb_phrases.append(" ".join(pieces))

    return noun_phrases, verb_phrases
| 115 | |
def extract_dependencies(text):
    """Extract per-sentence dependency information from text using spacy.

    The text is parsed once with the module-level ``nlp`` pipeline and then
    walked sentence by sentence.

    Args:
        text: Raw text to parse.

    Returns:
        A ``(sentences_deps, doc)`` tuple. ``sentences_deps`` holds one dict
        per sentence with the keys ``'sentence'`` (sentence text),
        ``'dependencies'`` (one dict per token: token, pos, dep, head,
        head_pos), ``'verb_relations'`` (one dict per VERB token listing the
        texts of its NOUN/PROPN and ADV children) and ``'independent_nouns'``
        (NOUN/PROPN tokens that are not a direct child of any VERB). ``doc``
        is the parsed document, returned so callers can reuse it.
    """
    noun_tags = ("NOUN", "PROPN")
    doc = nlp(text)

    # Process each sentence separately for better dependency analysis
    sentences_deps = []
    for sent in doc.sents:
        sent_deps = []
        verb_relations = []
        # Document indices (token.i) of nouns attached to a verb. Tracking the
        # index rather than the surface text keeps a second occurrence of the
        # same word from being treated as verb-dependent by accident.
        verb_dependent_ids = set()

        for token in sent:
            # Extract: token, POS tag, dependency relation, head token
            sent_deps.append({
                'token': token.text,
                'pos': token.pos_,
                'dep': token.dep_,
                'head': token.head.text,
                'head_pos': token.head.pos_
            })

            # If token is a verb, extract its NOUN and ADV dependencies
            if token.pos_ == "VERB":
                nouns = []
                advs = []
                for child in token.children:
                    if child.pos_ in noun_tags:
                        nouns.append(child.text)
                        verb_dependent_ids.add(child.i)
                    elif child.pos_ == "ADV":
                        advs.append(child.text)

                verb_relations.append({
                    'verb': token.text,
                    'nouns': nouns,
                    'advs': advs
                })

        # Find nouns that are NOT dependent on any verb
        independent_nouns = []
        for token in sent:
            if token.pos_ not in noun_tags or token.i in verb_dependent_ids:
                continue

            noun_info = {
                'noun': token.text,
                'dep': token.dep_,
                'head': token.head.text,
                'head_pos': token.head.pos_
            }

            # If the noun depends on a preposition, also show what the preposition depends on
            if token.head.pos_ == "ADP":
                noun_info['prep_head'] = token.head.head.text
                noun_info['prep_head_pos'] = token.head.head.pos_
                noun_info['prep_dep'] = token.head.dep_

            independent_nouns.append(noun_info)

        sentences_deps.append({
            'sentence': sent.text,
            'dependencies': sent_deps,
            'verb_relations': verb_relations,
            'independent_nouns': independent_nouns
        })

    return sentences_deps, doc
| 182 | |
def lemmatize_rake_keyphrases_from_doc(rake_keywords, doc):
    """Lemmatize RAKE keyphrases using an already-parsed spacy doc.

    Each keyphrase is lowercased, split on whitespace and searched for as a
    contiguous run of the doc's non-punctuation tokens (case-insensitive).
    The first match supplies the lemmas; when nothing matches, the
    lowercased keyphrase is used unchanged.

    Args:
        rake_keywords: Iterable of ``(score, keyphrase)`` pairs.
        doc: Iterable of tokens exposing ``text``, ``lemma_`` and ``is_punct``.

    Returns:
        List of ``(score, keyphrase, lemmatized_phrase)`` tuples, in input order.
    """
    # These depend only on the doc, so compute them once rather than per keyphrase.
    doc_tokens = [t for t in doc if not t.is_punct]
    doc_words = [t.text.lower() for t in doc_tokens]

    lemmatized = []
    for score, keyphrase in rake_keywords:
        keyphrase_lower = keyphrase.lower()
        keyphrase_tokens = keyphrase_lower.split()
        width = len(keyphrase_tokens)

        # Slide a window of the keyphrase's length over the doc words.
        matched_tokens = []
        if width:
            for start in range(len(doc_words) - width + 1):
                if doc_words[start:start + width] == keyphrase_tokens:
                    matched_tokens = doc_tokens[start:start + width]
                    break

        if matched_tokens:
            lemmatized_phrase = " ".join(token.lemma_ for token in matched_tokens)
        else:
            # If no match found, just keep the original lowercased
            lemmatized_phrase = keyphrase_lower

        lemmatized.append((score, keyphrase, lemmatized_phrase))

    return lemmatized
| 225 | |
def lemmatize_yake_keyphrases_from_doc(yake_keywords, doc):
    """Lemmatize YAKE keyphrases using an already-parsed spacy doc.

    Each keyphrase is lowercased, split on whitespace and searched for as a
    contiguous run of the doc's non-punctuation tokens (case-insensitive).
    The first match supplies the lemmas; when nothing matches, the
    lowercased keyphrase is used unchanged.

    Args:
        yake_keywords: Iterable of ``(score, keyphrase)`` pairs.
        doc: Iterable of tokens exposing ``text``, ``lemma_`` and ``is_punct``.

    Returns:
        List of ``(score, keyphrase, lemmatized_phrase)`` tuples, in input order.
    """
    # These depend only on the doc, so compute them once rather than per keyphrase.
    doc_tokens = [t for t in doc if not t.is_punct]
    doc_words = [t.text.lower() for t in doc_tokens]

    lemmatized = []
    for score, keyphrase in yake_keywords:
        keyphrase_lower = keyphrase.lower()
        keyphrase_tokens = keyphrase_lower.split()
        span_len = len(keyphrase_tokens)

        # Slide a window of the keyphrase's length over the doc words.
        matched_tokens = []
        if span_len:
            for offset in range(len(doc_words) - span_len + 1):
                if doc_words[offset:offset + span_len] == keyphrase_tokens:
                    matched_tokens = doc_tokens[offset:offset + span_len]
                    break

        if matched_tokens:
            lemmatized_phrase = " ".join(token.lemma_ for token in matched_tokens)
        else:
            # If no match found, just keep the original lowercased
            lemmatized_phrase = keyphrase_lower

        lemmatized.append((score, keyphrase, lemmatized_phrase))

    return lemmatized
| 268 | |
def analyze_rake_phrase_relationships(rake_keywords, doc):
    """Find noun/verb phrases in doc that contain two or more RAKE keyphrases.

    Keyphrases are lowercased, stripped, de-duplicated (first occurrence
    wins) and matched as whole words, so a keyphrase such as "art" does not
    match inside "party" and a repeated keyphrase is not counted twice.

    Args:
        rake_keywords: Iterable of ``(score, keyword)`` pairs.
        doc: Parsed spacy document (iterated for tokens, and read for
            ``noun_chunks``).

    Returns:
        List of dicts with ``'type'`` ('noun phrase' or 'verb phrase'),
        ``'phrase'`` (original-cased phrase text) and ``'keyphrases'`` (the
        distinct keyphrases found in it). Noun phrases come first.
    """
    import re

    # Distinct, non-empty keyphrases in their original ranking order.
    normalized = (keyword.lower().strip() for _score, keyword in rake_keywords)
    rake_phrases = list(dict.fromkeys(kp for kp in normalized if kp))

    # (?<!\w) / (?!\w) act as word boundaries that also work when the
    # keyphrase itself starts or ends with a non-word character.
    matchers = [
        (kp, re.compile(r"(?<!\w)" + re.escape(kp) + r"(?!\w)"))
        for kp in rake_phrases
    ]

    # Get noun and verb phrases from doc
    noun_phrases = [chunk.text for chunk in doc.noun_chunks]

    verb_phrases = []
    for token in doc:
        if token.pos_ == "VERB":
            phrase_tokens = [token.text]
            for child in token.children:
                if child.dep_ in ("dobj", "attr", "prep", "pobj", "advmod", "aux", "auxpass", "neg"):
                    phrase_tokens.append(child.text)
            # A bare verb cannot relate two keyphrases, so only keep real phrases.
            if len(phrase_tokens) > 1:
                verb_phrases.append(" ".join(phrase_tokens))

    # Find phrases that contain 2+ RAKE keyphrases
    relationships = []
    for phrase_type, phrases in (('noun phrase', noun_phrases), ('verb phrase', verb_phrases)):
        for phrase in phrases:
            phrase_lower = phrase.lower()
            matching_keyphrases = [kp for kp, rx in matchers if rx.search(phrase_lower)]
            if len(matching_keyphrases) >= 2:
                relationships.append({
                    'type': phrase_type,
                    'phrase': phrase,
                    'keyphrases': matching_keyphrases
                })

    return relationships
| 311 | |
def analyze_yake_phrase_relationships(yake_keywords, doc):
    """Find noun/verb phrases in doc that contain two or more YAKE keyphrases.

    Keyphrases are lowercased, stripped, de-duplicated (first occurrence
    wins) and matched as whole words, so a keyphrase such as "art" does not
    match inside "party" and a repeated keyphrase is not counted twice.

    Args:
        yake_keywords: Iterable of ``(score, keyword)`` pairs.
        doc: Parsed spacy document (iterated for tokens, and read for
            ``noun_chunks``).

    Returns:
        List of dicts with ``'type'`` ('noun phrase' or 'verb phrase'),
        ``'phrase'`` (original-cased phrase text) and ``'keyphrases'`` (the
        distinct keyphrases found in it). Noun phrases come first.
    """
    import re

    # Distinct, non-empty keyphrases in their original ranking order.
    normalized = (keyword.lower().strip() for _score, keyword in yake_keywords)
    yake_phrases = list(dict.fromkeys(kp for kp in normalized if kp))

    # (?<!\w) / (?!\w) act as word boundaries that also work when the
    # keyphrase itself starts or ends with a non-word character.
    matchers = [
        (kp, re.compile(r"(?<!\w)" + re.escape(kp) + r"(?!\w)"))
        for kp in yake_phrases
    ]

    # Get noun and verb phrases from doc
    noun_phrases = [chunk.text for chunk in doc.noun_chunks]

    verb_phrases = []
    for token in doc:
        if token.pos_ == "VERB":
            phrase_tokens = [token.text]
            for child in token.children:
                if child.dep_ in ("dobj", "attr", "prep", "pobj", "advmod", "aux", "auxpass", "neg"):
                    phrase_tokens.append(child.text)
            # A bare verb cannot relate two keyphrases, so only keep real phrases.
            if len(phrase_tokens) > 1:
                verb_phrases.append(" ".join(phrase_tokens))

    # Find phrases that contain 2+ YAKE keyphrases
    relationships = []
    for phrase_type, phrases in (('noun phrase', noun_phrases), ('verb phrase', verb_phrases)):
        for phrase in phrases:
            phrase_lower = phrase.lower()
            matching_keyphrases = [kp for kp, rx in matchers if rx.search(phrase_lower)]
            if len(matching_keyphrases) >= 2:
                relationships.append({
                    'type': phrase_type,
                    'phrase': phrase,
                    'keyphrases': matching_keyphrases
                })

    return relationships
| 354 | |
# Dependency label of a verb's child -> (priority, relation type).
_VERB_CHILD_ROLES = {
    'nsubj': (3, 'subject'),
    'nsubjpass': (3, 'subject'),
    'dobj': (3, 'object'),
    'attr': (3, 'object'),
    'advmod': (2, 'adverb'),
    'iobj': (2, 'indirect_obj'),
    'dative': (2, 'indirect_obj'),
}

# Dependency label of a noun's child -> (priority, relation type).
_NOUN_CHILD_ROLES = {
    'amod': (3, 'adjective'),
    'compound': (3, 'compound'),
    'poss': (2, 'modifier'),
    'nmod': (2, 'modifier'),
}


def _collect_child_candidates(token, roles):
    """Return (priority, token, relation, type) tuples for token's children, in child order."""
    found = []
    for child in token.children:
        label = child.dep_
        if label == 'prep':
            # A preposition contributes its object(s), not itself.
            for grandchild in child.children:
                if grandchild.dep_ == 'pobj':
                    found.append((2, grandchild, f'{child.text}_{grandchild.dep_}', 'prep_obj'))
        elif label in roles:
            weight, kind = roles[label]
            found.append((weight, child, label, kind))
    return found


def extract_keyword_relations(doc, message):
    """Collect every NOUN, PROPN and VERB per sentence with up to two related words.

    Args:
        doc: Parsed spacy document; its sentences are processed one by one.
        message: Not used by this function; kept for call compatibility.

    Returns:
        List of ``{'sentence': str, 'keywords': [...]}`` dicts, one per
        sentence that has at least one keyword. Each keyword dict carries
        ``'keyword'``, ``'pos'``, ``'lemma'`` and ``'relations'``; a relation
        carries ``'word'``, ``'lemma'``, ``'relation'``, ``'type'``, ``'pos'``.
    """
    per_sentence = []

    for sent in doc.sents:
        entries = []

        for token in sent:
            tag = token.pos_

            if tag == 'VERB':
                # Verbs: subject, object, adverbs, prepositional and indirect objects
                ranked = _collect_child_candidates(token, _VERB_CHILD_ROLES)
            elif tag in ('NOUN', 'PROPN'):
                # Nouns: adjectives, compounds, prepositional objects, possessives
                ranked = _collect_child_candidates(token, _NOUN_CHILD_ROLES)

                # Lower-priority link upwards to whatever governs this noun
                governor = token.head
                if governor.pos_ == 'VERB':
                    if token.dep_ in ('nsubj', 'nsubjpass', 'dobj', 'attr'):
                        ranked.append((1, governor, token.dep_, 'verb_relation'))
                elif governor.pos_ == 'ADP':
                    # Look through the preposition to the word it attaches to
                    if governor.head.pos_ in ('NOUN', 'PROPN', 'VERB'):
                        ranked.append((1, governor.head, f'via_{governor.text}', 'prep_head'))
            else:
                continue

            # Stable sort: equal priorities keep their discovery order.
            best = sorted(ranked, key=lambda cand: cand[0], reverse=True)[:2]

            entries.append({
                'keyword': token.text,
                'pos': tag,
                'lemma': token.lemma_,
                'relations': [
                    {
                        'word': other.text,
                        'lemma': other.lemma_,
                        'relation': relation,
                        'type': kind,
                        'pos': other.pos_
                    }
                    for _weight, other, relation, kind in best
                ]
            })

        if entries:
            per_sentence.append({
                'sentence': sent.text,
                'keywords': entries
            })

    return per_sentence
| 439 | |
# Common verbs (lemmas) that are too generic to be useful keywords.
_STOP_VERBS = frozenset({
    'be', 'have', 'do', 'say', 'get', 'make', 'go', 'know', 'take', 'see',
    'come', 'think', 'look', 'want', 'give', 'use', 'find', 'tell', 'ask',
    'work', 'seem', 'feel', 'try', 'leave', 'call', 'need', 'become', 'show',
    'mean', 'keep', 'let', 'begin', 'help', 'talk', 'turn', 'start', 'run',
    'move', 'like', 'live', 'believe', 'hold', 'bring', 'happen', 'write',
    'provide', 'sit', 'stand', 'lose', 'pay', 'meet', 'include', 'continue',
    'set', 'learn', 'change', 'lead', 'understand', 'watch', 'follow', 'stop',
    'create', 'speak', 'read', 'allow', 'add', 'spend', 'grow', 'open', 'walk',
    'win', 'offer', 'remember', 'consider', 'appear', 'buy', 'wait', 'serve',
    'die', 'send', 'expect', 'build', 'stay', 'fall', 'cut', 'reach', 'kill',
    'remain', 'suggest', 'raise', 'pass', 'sell', 'require', 'report', 'decide',
})

# Common nouns (lemmas) that are too generic to be useful keywords.
_STOP_NOUNS = frozenset({
    'thing', 'time', 'year', 'way', 'day', 'man', 'people', 'person', 'woman',
    'life', 'child', 'world', 'hand', 'part', 'place', 'case', 'week', 'company',
    'system', 'program', 'question', 'work', 'number', 'night', 'point', 'home',
    'water', 'room', 'mother', 'area', 'money', 'story', 'fact', 'month', 'lot',
    'right', 'study', 'book', 'eye', 'job', 'word', 'issue', 'side', 'kind',
    'head', 'house', 'service', 'friend', 'father', 'power', 'hour', 'game',
    'line', 'end', 'member', 'law', 'car', 'city', 'community', 'name', 'president',
    'team', 'minute', 'idea', 'body', 'information', 'back', 'parent', 'face',
    'others', 'level', 'office', 'door', 'health', 'art', 'war', 'history',
    'party', 'result', 'change', 'morning', 'reason', 'research', 'girl', 'guy',
    'moment', 'air', 'teacher', 'force', 'education',
})


def _is_informative_keyword(kw):
    """Return True when a keyword dict survives the stop-word filter."""
    pos = kw['pos']
    # Always keep proper nouns
    if pos == 'PROPN':
        return True
    if pos == 'VERB':
        return kw['lemma'].lower() not in _STOP_VERBS
    if pos == 'NOUN':
        return kw['lemma'].lower() not in _STOP_NOUNS
    # Any other part of speech is dropped.
    return False


def filter_keywords_by_stopwords(keyword_relations):
    """
    Filter out common/generic keywords using an expanded stop word list.
    Always keeps proper nouns (PROPN); keeps VERB and NOUN keywords whose
    lowercased lemma is not in the stop lists; drops every other keyword.

    Args:
        keyword_relations: List of sentence dicts with 'sentence' and 'keywords'

    Returns:
        Filtered keyword_relations without generic keywords. Sentences left
        with no keywords are omitted; keyword dicts are reused, not copied.
    """
    filtered_relations = []
    for sent_info in keyword_relations:
        filtered_keywords = [
            kw for kw in sent_info['keywords'] if _is_informative_keyword(kw)
        ]

        if filtered_keywords:
            filtered_relations.append({
                'sentence': sent_info['sentence'],
                'keywords': filtered_keywords
            })

    return filtered_relations
| 506 | |
def extract_sentence_word_lists(filtered_keyword_relations):
    """
    Flatten filtered keywords into one sorted list of unique lemmas per sentence.

    A keyword that is a pronoun is skipped together with its relations.
    Otherwise the keyword's lemma is collected when it is a PROPN, or a
    VERB/NOUN outside the stop lists. Related words follow the same rules
    and are additionally collected when they are ADJ or ADV.

    Args:
        filtered_keyword_relations: Filtered list of sentence dicts with keywords

    Returns:
        List of dicts with 'sentence' and 'words' (sorted list of unique lemmas)
    """
    # Common verbs to exclude
    stop_verbs = frozenset((
        "be have do say get make go know take see "
        "come think look want give use find tell ask "
        "work seem feel try leave call need become show "
        "mean keep let begin help talk turn start run "
        "move like live believe hold bring happen write "
        "provide sit stand lose pay meet include continue "
        "set learn change lead understand watch follow stop "
        "create speak read allow add spend grow open walk "
        "win offer remember consider appear buy wait serve "
        "die send expect build stay fall cut reach kill "
        "remain suggest raise pass sell require report decide"
    ).split())

    # Common nouns to exclude
    stop_nouns = frozenset((
        "thing time year way day man people person woman "
        "life child world hand part place case week company "
        "system program question work number night point home "
        "water room mother area money story fact month lot "
        "right study book eye job word issue side kind "
        "head house service friend father power hour game "
        "line end member law car city community name president "
        "team minute idea body information back parent face "
        "others level office door health art war history "
        "party result change morning reason research girl guy "
        "moment air teacher force education"
    ).split())

    # Pronouns to exclude (lemmatized forms)
    pronouns = frozenset((
        "i you he she it we they "
        "me him her us them "
        "my your his her its our their "
        "mine yours hers ours theirs "
        "myself yourself himself herself itself ourselves themselves "
        "this that these those "
        "who whom whose which what "
        "anybody anyone anything everybody everyone everything "
        "nobody nothing somebody someone something"
    ).split())

    def is_pronoun(lemma, pos):
        return pos == 'PRON' or lemma.lower() in pronouns

    def is_wanted(lemma, pos, extra_pos=()):
        # Proper nouns always pass; verbs and nouns must clear their stop list.
        if pos == 'PROPN':
            return True
        if pos == 'VERB':
            return lemma.lower() not in stop_verbs
        if pos == 'NOUN':
            return lemma.lower() not in stop_nouns
        return pos in extra_pos

    sentence_lists = []

    for entry in filtered_keyword_relations:
        collected = set()

        for kw in entry['keywords']:
            kw_lemma, kw_pos = kw['lemma'], kw['pos']

            # A pronoun keyword is dropped along with its related words
            if is_pronoun(kw_lemma, kw_pos):
                continue

            if is_wanted(kw_lemma, kw_pos):
                collected.add(kw_lemma)

            for rel in kw['relations']:
                rel_lemma, rel_pos = rel['lemma'], rel['pos']
                if is_pronoun(rel_lemma, rel_pos):
                    continue
                # Related words may also be adjectives or adverbs
                if is_wanted(rel_lemma, rel_pos, ('ADJ', 'ADV')):
                    collected.add(rel_lemma)

        sentence_lists.append({
            'sentence': entry['sentence'],
            'words': sorted(collected)  # Sort for consistent output
        })

    return sentence_lists
| 608 | |
def is_common_phrase(phrase):
    """Return True when no token in phrase is a proper noun or an unusual word.

    The phrase is parsed with the module-level ``nlp`` pipeline. Punctuation
    is ignored. A token makes the phrase "uncommon" when it is tagged PROPN,
    or when it is an alphabetic non-stop word whose ``rank`` is 0 or above
    10000.

    NOTE(review): this treats ``token.rank`` as a frequency rank - confirm
    that this matches what the loaded model stores in that attribute.
    """
    for tok in nlp(phrase):
        if tok.is_punct:
            continue
        if tok.pos_ == "PROPN":
            return False
        content_word = tok.is_alpha and not tok.is_stop
        if content_word and (tok.rank == 0 or tok.rank > 10000):
            return False
    return True
| 626 | |
def print_keywords(keywords, time_taken, file, extractor_name="RAKE", lemmatized_data=None):
    """Write one extractor's keywords, scores and timing to file.

    Args:
        keywords: Sequence of ``(score, keyword)`` pairs; when empty only a
            "no keywords" line is written.
        time_taken: Extraction time in seconds, shown in the summary line.
        file: Writable text stream.
        extractor_name: Label used in the header / "no keywords" line.
        lemmatized_data: Optional sequence of ``(score, original, lemmatized)``
            triples. When given (and non-empty) it is listed instead of
            ``keywords``, with the lemmatized form appended whenever it
            differs from the lowercased original.
    """
    if not keywords:
        file.write(f"No {extractor_name} keywords extracted\n")
        return

    emit = file.write
    emit(f"{extractor_name} Keywords:\n")

    if not lemmatized_data:
        # Plain listing, no lemma information available
        for weight, phrase in keywords:
            emit(f" - {phrase} (score: {weight:.2f})\n")
    else:
        for weight, surface, lemma_form in lemmatized_data:
            line = f" - {surface} (score: {weight:.2f})"
            if lemma_form != surface.lower():
                line += f" → {lemma_form}"
            emit(line + "\n")

    emit(f"Total keywords: {len(keywords)} extracted in {time_taken:.4f} seconds\n")
| 645 | |
def print_named_entities(entities, file):
    """Write ``(entity_text, entity_type)`` pairs and their count to file.

    When ``entities`` is empty a single "no named entities" line is written
    instead.
    """
    if not entities:
        file.write("\nNo named entities extracted\n")
        return

    file.write("\nNamed Entities:\n")
    for surface, label in entities:
        file.write(f" - {surface} ({label})\n")
    file.write(f"Total named entities: {len(entities)}\n")
| 655 | |
def print_phrases(noun_phrases, verb_phrases, file):
    """Write the noun-phrase section followed by the verb-phrase section to file.

    Each section has a heading, then either one line per phrase plus a total,
    or a single "none extracted" line when the list is empty.
    """
    sections = (
        ("Noun", "noun", noun_phrases),
        ("Verb", "verb", verb_phrases),
    )
    for heading, kind, phrases in sections:
        file.write(f"\n{heading} Phrases:\n")
        if not phrases:
            file.write(f" No {kind} phrases extracted\n")
            continue
        for item in phrases:
            file.write(f" - {item}\n")
        file.write(f"Total {kind} phrases: {len(phrases)}\n")
| 673 | |
def print_dependencies(sentences_deps, time_taken, file):
    """Write the per-sentence dependency report to file.

    Args:
        sentences_deps: List of sentence dicts with 'sentence' and
            'dependencies', and optionally 'verb_relations' and
            'independent_nouns'.
        time_taken: Extraction time in seconds, shown in the header.
        file: Writable text stream.
    """
    out = file.write
    out(f"\nDependency Relations (extracted in {time_taken:.4f}sec):\n")

    if not sentences_deps:
        out(" No dependencies extracted\n")
        return

    for number, info in enumerate(sentences_deps, start=1):
        out(f"\n Sentence {number}: {info['sentence']}\n")

        # Every token's edge to its head
        out(" Dependencies:\n")
        for edge in info['dependencies']:
            out(f" {edge['token']} ({edge['pos']}) --[{edge['dep']}]--> {edge['head']} ({edge['head_pos']})\n")

        # Each verb with its noun and adverb children
        verb_rows = info.get('verb_relations')
        if verb_rows:
            out(" Verb Relations:\n")
            for row in verb_rows:
                noun_part = ', '.join(row['nouns']) if row['nouns'] else 'none'
                adv_part = ', '.join(row['advs']) if row['advs'] else 'none'
                out(f" VERB: {row['verb']} | NOUNs: {noun_part} | ADVs: {adv_part}\n")

        # Nouns that are not attached to any verb
        loose_nouns = info.get('independent_nouns')
        if loose_nouns:
            out(" Independent NOUNs (not verb-dependent):\n")
            for noun in loose_nouns:
                out(f" {noun['noun']} --[{noun['dep']}]--> {noun['head']} ({noun['head_pos']})")
                # For a noun under a preposition, extend the chain by one hop
                if 'prep_head' in noun:
                    out(f" --[{noun['prep_dep']}]--> {noun['prep_head']} ({noun['prep_head_pos']})")
                out("\n")

    out(f"\nTotal sentences: {len(sentences_deps)}\n")
| 707 | |
def print_keyword_relationships(keyword_relations, file, header="Keyword Relationships (All NOUNs and VERBs)"):
    """Write each sentence's keywords and their related words to file.

    Args:
        keyword_relations: List of sentence dicts with 'sentence' and
            'keywords'; each keyword has 'lemma', 'pos' and 'relations'.
        file: Writable text stream.
        header: Section title written before the listing.
    """
    file.write(f"\n{header}:\n")

    if not keyword_relations:
        file.write(" No keywords found\n")
        return

    for entry in keyword_relations:
        file.write(f"\n Sentence: {entry['sentence']}\n")
        file.write(" Keywords with relations:\n")

        for kw in entry['keywords']:
            # Each related word is shown as lemma(type)
            labels = [f"{rel['lemma']}({rel['type']})" for rel in kw['relations']]
            if labels:
                summary = ', '.join(labels)
            else:
                summary = 'none'
            file.write(f" {kw['lemma']} [{kw['pos']}]: {summary}\n")
| 727 | |
# Running totals accumulated while the messages are processed.
count = 0
rake_total_time = yake_total_time = keybert_total_time = dep_total_time = 0
total_keyword_density = 0.0
total_words = 0  # Track total words across all messages
all_keywords = set()  # Track unique keywords across all messages
| 736 | |
| 737 | with open(output_file, 'w', encoding='utf-8') as f: |
| 738 | for item in rawData: |
| 739 | # Get message text |
| 740 | message = item['speaker'] + ": " + item['content'] |
| 741 | |
| 742 | # Time RAKE extraction |
| 743 | rake_start = time.time() |
| 744 | rake_keywords = extract_keywords_rake(message) |
| 745 | rake_end = time.time() |
| 746 | rake_time = rake_end - rake_start |
| 747 | rake_total_time += rake_time |
| 748 | |
| 749 | # Time YAKE extraction |
| 750 | yake_start = time.time() |
| 751 | yake_keywords = extract_keywords_yake(message) |
| 752 | yake_end = time.time() |
| 753 | yake_time = yake_end - yake_start |
| 754 | yake_total_time += yake_time |
| 755 | |
| 756 | # Time KeyBERT extraction |
| 757 | keybert_start = time.time() |
| 758 | keybert_keywords = extract_keywords_keybert(message) |
| 759 | keybert_end = time.time() |
| 760 | keybert_time = keybert_end - keybert_start |
| 761 | keybert_total_time += keybert_time |
| 762 | |
| 763 | # Time dependency extraction (also returns doc for analysis) |
| 764 | dep_start = time.time() |
| 765 | dependencies, spacy_doc = extract_dependencies(message) |
| 766 | dep_end = time.time() |
| 767 | dep_time = dep_end - dep_start |
| 768 | dep_total_time += dep_time |
| 769 | |
| 770 | # Lemmatize RAKE keyphrases using the already-computed spacy doc |
| 771 | rake_lemmatized = lemmatize_rake_keyphrases_from_doc(rake_keywords, spacy_doc) |
| 772 | |
| 773 | # Lemmatize YAKE keyphrases using the already-computed spacy doc |
| 774 | yake_lemmatized = lemmatize_yake_keyphrases_from_doc(yake_keywords, spacy_doc) |
| 775 | |
| 776 | # Analyze RAKE keyphrase relationships using the spacy doc |
| 777 | rake_relationships = analyze_rake_phrase_relationships(rake_keywords, spacy_doc) |
| 778 | |
| 779 | # Analyze YAKE keyphrase relationships using the spacy doc |
| 780 | yake_relationships = analyze_yake_phrase_relationships(yake_keywords, spacy_doc) |
| 781 | |
| 782 | # Find keyword relationships for indexing |
| 783 | keyword_relationships = extract_keyword_relations(spacy_doc, message) |
| 784 | |
| 785 | # Filter keywords using stop words |
| 786 | filtered_keyword_relationships = filter_keywords_by_stopwords(keyword_relationships) |
| 787 | |
| 788 | # Extract named entities |
| 789 | # entities = extract_named_entities(message) |
| 790 | |
| 791 | # Extract noun and verb phrases |
| 792 | #noun_phrases, verb_phrases = extract_phrases(message) |
| 793 | |
| 794 | # Write original message |
| 795 | f.write(f"\n{'='*80}\n") |
| 796 | f.write(f"Message {count + 1}:\n") |
| 797 | f.write(message + "\n") |
| 798 | f.write('-' * 80 + "\n") |
| 799 | |
| 800 | if verbose: |
| 801 | # Write RAKE keywords with lemmatization |
| 802 | print_keywords(rake_keywords, rake_time, f, "RAKE", lemmatized_data=rake_lemmatized) |
| 803 | |
| 804 | # Write YAKE keywords with lemmatization |
| 805 | f.write("\n") |
| 806 | print_keywords(yake_keywords, yake_time, f, "YAKE", lemmatized_data=yake_lemmatized) |
| 807 | |
| 808 | # Write KeyBERT keywords |
| 809 | f.write("\n") |
| 810 | print_keywords(keybert_keywords, keybert_time, f, "KeyBERT") |
| 811 | |
| 812 | # Write dependencies |
| 813 | print_dependencies(dependencies, dep_time, f) |
| 814 | |
| 815 | # Write keyword relationships for indexing (unfiltered) |
| 816 | print_keyword_relationships(keyword_relationships, f) |
| 817 | |
| 818 | # Write filtered keyword relationships for indexing |
| 819 | print_keyword_relationships(filtered_keyword_relationships, f, header="Filtered Keywords (Stop Words Removed)") |
| 820 | |
| 821 | # Extract and print flat word lists per sentence |
| 822 | sentence_word_lists = extract_sentence_word_lists(filtered_keyword_relationships) |
| 823 | f.write(f"\nSentence Word Lists (Keywords + Relations):\n") |
| 824 | for sent_list in sentence_word_lists: |
| 825 | f.write(f" {sent_list['words']}\n") |
| 826 | |
| 827 | # Calculate and print word statistics |
| 828 | # Count unique words across all sentence lists |
| 829 | all_keywords_this_message = set() |
| 830 | for sent_list in sentence_word_lists: |
| 831 | all_keywords_this_message.update(sent_list['words']) |
| 832 | |
| 833 | # Update global tracking |
| 834 | all_keywords.update(all_keywords_this_message) |
| 835 | |
| 836 | # Count total words in original message |
| 837 | message_word_count = len(message.split()) |
| 838 | total_words += message_word_count |
| 839 | unique_keyword_count = len(all_keywords_this_message) |
| 840 | |
| 841 | f.write(f"\nWord Statistics:\n") |
| 842 | f.write(f" Total words in message: {message_word_count}\n") |
| 843 | f.write(f" Unique filtered keywords: {unique_keyword_count}\n") |
| 844 | if message_word_count > 0: |
| 845 | percentage = (unique_keyword_count / message_word_count) * 100 |
| 846 | f.write(f" Keyword density: {percentage:.1f}%\n") |
| 847 | total_keyword_density += percentage |
| 848 | |
| 849 | # Write timing comparison (only if verbose) |
| 850 | if verbose: |
| 851 | f.write(f"\nTiming Comparison:\n") |
| 852 | f.write(f" RAKE: {rake_time:.4f}sec\n") |
| 853 | f.write(f" YAKE: {yake_time:.4f}sec\n") |
| 854 | f.write(f" KeyBERT: {keybert_time:.4f}sec\n") |
| 855 | f.write(f" Dependencies: {dep_time:.4f}sec\n") |
| 856 | times = {'RAKE': rake_time, 'YAKE': yake_time, 'KeyBERT': keybert_time, 'Dependencies': dep_time} |
| 857 | fastest = min(times, key=times.get) |
| 858 | f.write(f" Fastest: {fastest}\n") |
| 859 | |
| 860 | # Write named entities |
| 861 | #print_named_entities(entities, f) |
| 862 | |
| 863 | # Write phrases |
| 864 | #print_phrases(noun_phrases, verb_phrases, f) |
| 865 | |
| 866 | count += 1 |
| 867 | |
| 868 | # Print progress indicator every 50 messages |
| 869 | if count % 50 == 0: |
| 870 | print(f"Progress: Processed {count} messages...") |
| 871 | |
| 872 | # Write overall timing summary |
| 873 | f.write(f"\n\n{'='*80}\n") |
| 874 | f.write(f"OVERALL TIMING SUMMARY ({count} messages):\n") |
| 875 | f.write(f"{'='*80}\n") |
| 876 | f.write(f"Total RAKE time: {rake_total_time:.4f}sec (avg: {rake_total_time/count:.4f}sec per message)\n") |
| 877 | f.write(f"Total YAKE time: {yake_total_time:.4f}sec (avg: {yake_total_time/count:.4f}sec per message)\n") |
| 878 | f.write(f"Total KeyBERT time: {keybert_total_time:.4f}sec (avg: {keybert_total_time/count:.4f}sec per message)\n") |
| 879 | f.write(f"Total Dependency time: {dep_total_time:.4f}sec (avg: {dep_total_time/count:.4f}sec per message)\n") |
| 880 | |
| 881 | avg_keyword_density = total_keyword_density / count if count > 0 else 0 |
| 882 | f.write(f"\nAverage keyword density: {avg_keyword_density:.1f}%\n") |
| 883 | f.write(f"\nTotal words across all messages: {total_words}\n") |
| 884 | f.write(f"Unique keywords across all messages: {len(all_keywords)}\n") |
| 885 | if total_words > 0: |
| 886 | overall_density = (len(all_keywords) / total_words) * 100 |
| 887 | f.write(f"Overall keyword density: {overall_density:.1f}%\n") |
| 888 | |
| 889 | total_times = {'RAKE': rake_total_time, 'YAKE': yake_total_time, 'KeyBERT': keybert_total_time, 'Dependencies': dep_total_time} |
| 890 | fastest = min(total_times, key=total_times.get) |
| 891 | slowest = max(total_times, key=total_times.get) |
| 892 | f.write(f"\nOverall fastest: {fastest}\n") |
| 893 | f.write(f"Overall slowest: {slowest}\n") |
| 894 | speedup = total_times[slowest] / total_times[fastest] |
| 895 | f.write(f"Speedup factor (fastest vs slowest): {speedup:.2f}x\n") |
| 896 | |
| 897 | print(f"Extraction complete. Results written to {output_file}") |
| 898 | print(f"Processed {count} messages") |
| 899 | print(f"RAKE total time: {rake_total_time:.4f}sec (avg: {rake_total_time/count:.4f}sec)") |
| 900 | print(f"YAKE total time: {yake_total_time:.4f}sec (avg: {yake_total_time/count:.4f}sec)") |
| 901 | print(f"KeyBERT total time: {keybert_total_time:.4f}sec (avg: {keybert_total_time/count:.4f}sec)") |
| 902 | print(f"Dependency time: {dep_total_time:.4f}sec (avg: {dep_total_time/count:.4f}sec)") |
| 903 | total_times = {'RAKE': rake_total_time, 'YAKE': yake_total_time, 'KeyBERT': keybert_total_time, 'Dependencies': dep_total_time} |
| 904 | fastest = min(total_times, key=total_times.get) |
| 905 | print(f"Overall fastest: {fastest}") |
| 906 | |