microsoft/TypeAgent
Public · mirrored from https://github.com/microsoft/TypeAgent · Available
python/fineTuning/unsloth/nltkExtract.py
918lines · modecode
| 1 | # Copyright (c) Microsoft Corporation and Henry Lucco. |
| 2 | # Licensed under the MIT License. |
| 3 | |
| 4 | from rake_nltk import Rake |
| 5 | import sys |
| 6 | import os |
| 7 | from argparse import ArgumentParser |
| 8 | import nltk |
| 9 | from nltk import word_tokenize, pos_tag, ne_chunk |
| 10 | from nltk.tree import Tree |
| 11 | import spacy |
| 12 | import yake |
| 13 | from keybert import KeyBERT |
| 14 | |
# Command-line interface. Every option has a default, so the script can be
# started without any arguments.
parser = ArgumentParser(
    description="Extract keywords from dataset using NLTK-RAKE."
)
parser.add_argument(
    "--dataset_path",
    type=str,
    default='/data/npr/npr_chunks_no_embedding.json',
    help="Path to the dataset file.",
)
parser.add_argument(
    "--max_length",
    type=int,
    default=1,
    help="Maximum number of words in a keyword phrase.",
)
parser.add_argument(
    "--output_file",
    type=str,
    default='extraction.txt',
    help="Path to the output file.",
)
parser.add_argument(
    "--verbose",
    action='store_true',
    help="Enable verbose output (shows all extraction details). Default is non-verbose.",
)
parser.add_argument(
    "--nogpu",
    action='store_true',
    help="Force KeyBERT to use CPU instead of GPU. Default is false (use GPU if available).",
)
args = parser.parse_args(sys.argv[1:])

# Expose the parsed options as module-level names for the rest of the script.
dataset_path, max_length, output_file = (
    args.dataset_path,
    args.max_length,
    args.output_file,
)
verbose, nogpu = args.verbose, args.nogpu
| 27 | |
# Initialize RAKE with max_length configuration
# (max_length is the --max_length option: maximum number of words per keyword phrase)
rake = Rake(max_length=max_length)

# Initialize YAKE keyword extractor
# Parameters: language, max_ngram_size, deduplication_threshold, number of keywords
yake_extractor = yake.KeywordExtractor(lan="en", n=max_length, dedupLim=0.9, top=20)

# Initialize KeyBERT model
if nogpu:
    # Imported lazily: sentence_transformers is only needed by name when we
    # have to build the embedding model ourselves to pin it to the CPU.
    from sentence_transformers import SentenceTransformer
    # Force sentence transformer to use CPU
    model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
    keybert_model = KeyBERT(model=model)
    print('Using KeyBERT on CPU')
else:
    # No explicit model is passed; presumably KeyBERT then chooses its own
    # default embedding model and device -- verify against the KeyBERT docs.
    keybert_model = KeyBERT()

# Load the spaCy pipeline.
# Only the "ner" component is disabled below; every other component shipped
# with en_core_web_sm stays enabled. (An earlier version of this comment
# also listed attribute_ruler as disabled, which the code does not do.)
# The rest of this file reads token.pos_, token.dep_, token.lemma_,
# doc.noun_chunks and doc.sents from the resulting docs.
if nogpu:
    # Force spaCy to use CPU
    spacy.require_cpu()
    print('Using spaCy on CPU')
nlp = spacy.load("en_core_web_sm", disable=["ner"])
| 53 | |
# Load an array of JSON objects with properties like speaker and content
import json
import time

# Decode explicitly as UTF-8. Without an encoding argument open() falls back
# to the platform's locale encoding, which can mis-decode or reject non-ASCII
# text in the dataset; the output file of this script is written as UTF-8 too.
with open(dataset_path, encoding='utf-8') as f:
    rawData = json.load(f)
| 60 | |
def extract_keywords_rake(text):
    """Run the module-level RAKE extractor over ``text``.

    Returns whatever ``rake.get_ranked_phrases_with_scores()`` yields; the
    callers in this file unpack each entry as ``(score, phrase)``.
    """
    # RAKE is used in two steps: feed the text, then read the ranking back.
    # Phrase length is already capped by the max_length the extractor was
    # constructed with.
    rake.extract_keywords_from_text(text)
    return rake.get_ranked_phrases_with_scores()
| 69 | |
def extract_keywords_yake(text):
    """Run the module-level YAKE extractor over ``text``.

    Returns a list of ``(score, keyword)`` tuples. YAKE itself produces
    ``(keyword, score)`` with lower scores being better; the pair is swapped
    so the result has the same layout as the RAKE output.
    """
    swapped = []
    for keyword, score in yake_extractor.extract_keywords(text):
        swapped.append((score, keyword))
    return swapped
| 77 | |
def extract_keywords_keybert(text):
    """Run the module-level KeyBERT model over ``text``.

    Asks for the top 20 keyphrases of 1..max_length words and returns them
    as ``(score, keyword)`` tuples. KeyBERT produces ``(keyword, score)``
    with higher scores being better; the pair is swapped to match the RAKE
    layout.
    """
    ranked = keybert_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, max_length),
        top_n=20,
    )
    return [(score, keyword) for keyword, score in ranked]
| 86 | |
def extract_named_entities(text):
    """Find named entities in ``text`` with NLTK.

    The text is tokenized, POS-tagged and chunked with ``ne_chunk`` in
    non-binary mode. Returns a list of ``(entity_text, entity_type)`` tuples
    in the order the chunks appear.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    chunk_tree = ne_chunk(tagged_tokens, binary=False)

    # Only the Tree children of the chunker output are collected as entities;
    # each one carries its type as the subtree label.
    return [
        (" ".join(word for word, _tag in node.leaves()), node.label())
        for node in chunk_tree
        if isinstance(node, Tree)
    ]
| 105 | |
def extract_phrases(text):
    """Collect noun chunks and simple verb phrases from ``text`` via spaCy.

    Returns ``(noun_phrases, verb_phrases)``. A verb phrase is the verb text
    followed by the text of those direct children whose dependency label is
    in the set below; a verb with no such child contributes just its own text.
    """
    doc = nlp(text)

    noun_phrases = [chunk.text for chunk in doc.noun_chunks]

    # Dependency labels of children that are attached to the verb's phrase.
    attached_deps = ("dobj", "attr", "prep", "pobj", "advmod", "aux", "auxpass", "neg")

    verb_phrases = []
    for token in doc:
        if token.pos_ != "VERB":
            continue
        words = [token.text]
        words.extend(
            child.text for child in token.children if child.dep_ in attached_deps
        )
        # Joining a single-element list yields the bare verb, so one append
        # covers both the "verb + dependents" and the "verb only" case.
        verb_phrases.append(" ".join(words))

    return noun_phrases, verb_phrases
| 128 | |
def extract_dependencies(text):
    """Extract per-sentence dependency information from ``text`` using spaCy.

    Returns ``(sentences_deps, doc)`` where ``doc`` is the parsed spaCy doc
    (returned so callers can reuse it) and ``sentences_deps`` has one dict
    per sentence with the keys:

    - ``'sentence'``: the sentence text
    - ``'dependencies'``: one dict per token with ``token``, ``pos``,
      ``dep``, ``head`` and ``head_pos``
    - ``'verb_relations'``: one dict per VERB token with the texts of its
      NOUN/PROPN children (``nouns``) and ADV children (``advs``)
    - ``'independent_nouns'``: NOUN/PROPN tokens that are not a direct child
      of any VERB in the sentence; when the noun's head is an adposition the
      entry also carries ``prep_head``, ``prep_head_pos`` and ``prep_dep``
    """
    doc = nlp(text)

    sentences_deps = []
    for sent in doc.sents:
        sent_deps = []
        verb_relations = []
        # Track verb-dependent nouns by token index rather than by text:
        # matching on text would also drop an unrelated noun that merely has
        # the same spelling as a verb-dependent one in this sentence.
        verb_dependent_noun_ids = set()

        for token in sent:
            # Token, POS tag, dependency relation and head token
            sent_deps.append({
                'token': token.text,
                'pos': token.pos_,
                'dep': token.dep_,
                'head': token.head.text,
                'head_pos': token.head.pos_
            })

            # For verbs, record their direct NOUN/PROPN and ADV children
            if token.pos_ == "VERB":
                nouns = []
                advs = []
                for child in token.children:
                    if child.pos_ in ("NOUN", "PROPN"):
                        nouns.append(child.text)
                        verb_dependent_noun_ids.add(child.i)
                    elif child.pos_ == "ADV":
                        advs.append(child.text)

                verb_relations.append({
                    'verb': token.text,
                    'nouns': nouns,
                    'advs': advs
                })

        # Second pass: nouns that are NOT a direct child of any verb
        independent_nouns = []
        for token in sent:
            if token.pos_ not in ("NOUN", "PROPN"):
                continue
            if token.i in verb_dependent_noun_ids:
                continue

            noun_info = {
                'noun': token.text,
                'dep': token.dep_,
                'head': token.head.text,
                'head_pos': token.head.pos_
            }

            # If the noun hangs off a preposition, also record what the
            # preposition itself attaches to
            if token.head.pos_ == "ADP":
                noun_info['prep_head'] = token.head.head.text
                noun_info['prep_head_pos'] = token.head.head.pos_
                noun_info['prep_dep'] = token.head.dep_

            independent_nouns.append(noun_info)

        sentences_deps.append({
            'sentence': sent.text,
            'dependencies': sent_deps,
            'verb_relations': verb_relations,
            'independent_nouns': independent_nouns
        })

    return sentences_deps, doc
| 195 | |
def lemmatize_rake_keyphrases_from_doc(rake_keywords, doc):
    """Lemmatize RAKE keyphrases by locating them in an existing spaCy doc.

    Args:
        rake_keywords: iterable of ``(score, keyphrase)`` pairs.
        doc: iterable of tokens exposing ``is_punct``, ``text`` and ``lemma_``.

    Returns:
        List of ``(score, keyphrase, lemmatized_phrase)`` tuples, one per
        input pair and in the same order. ``lemmatized_phrase`` is the
        space-joined lemmas of the first run of non-punctuation doc tokens
        whose lowercased texts equal the whitespace-split, lowercased
        keyphrase. If no such run exists (or the keyphrase has no words) it
        is the lowercased keyphrase itself.
    """
    # The punctuation-free view of the doc is the same for every keyphrase,
    # so build it (and its lowercased texts) once instead of per keyphrase.
    doc_tokens = [t for t in doc if not t.is_punct]
    doc_words = [t.text.lower() for t in doc_tokens]

    lemmatized = []
    for score, keyphrase in rake_keywords:
        keyphrase_lower = keyphrase.lower()
        keyphrase_tokens = keyphrase_lower.split()
        width = len(keyphrase_tokens)

        # First window of `width` consecutive tokens that spells the keyphrase
        matched_tokens = []
        if width:
            for start in range(len(doc_words) - width + 1):
                if doc_words[start:start + width] == keyphrase_tokens:
                    matched_tokens = doc_tokens[start:start + width]
                    break

        if matched_tokens:
            lemmatized_phrase = " ".join(token.lemma_ for token in matched_tokens)
        else:
            # No match found: keep the original, lowercased
            lemmatized_phrase = keyphrase_lower

        lemmatized.append((score, keyphrase, lemmatized_phrase))

    return lemmatized
| 238 | |
def lemmatize_yake_keyphrases_from_doc(yake_keywords, doc):
    """Lemmatize YAKE keyphrases by locating them in an existing spaCy doc.

    Args:
        yake_keywords: iterable of ``(score, keyphrase)`` pairs.
        doc: iterable of tokens exposing ``is_punct``, ``text`` and ``lemma_``.

    Returns:
        List of ``(score, keyphrase, lemmatized_phrase)`` tuples, one per
        input pair and in the same order. ``lemmatized_phrase`` is the
        space-joined lemmas of the first run of non-punctuation doc tokens
        whose lowercased texts equal the whitespace-split, lowercased
        keyphrase. If no such run exists (or the keyphrase has no words) it
        is the lowercased keyphrase itself.
    """
    # Loop-invariant: the punctuation-free token view does not depend on the
    # keyphrase, so compute it once up front.
    content_tokens = [t for t in doc if not t.is_punct]
    content_words = [t.text.lower() for t in content_tokens]

    lemmatized = []
    for score, keyphrase in yake_keywords:
        keyphrase_lower = keyphrase.lower()
        wanted = keyphrase_lower.split()
        span = len(wanted)

        # Slide a window of `span` tokens over the doc; stop at the first hit
        matched_tokens = []
        if span:
            for offset in range(len(content_words) - span + 1):
                if content_words[offset:offset + span] == wanted:
                    matched_tokens = content_tokens[offset:offset + span]
                    break

        if matched_tokens:
            lemmatized_phrase = " ".join(token.lemma_ for token in matched_tokens)
        else:
            # No match found: keep the original, lowercased
            lemmatized_phrase = keyphrase_lower

        lemmatized.append((score, keyphrase, lemmatized_phrase))

    return lemmatized
| 281 | |
def analyze_rake_phrase_relationships(rake_keywords, doc):
    """Find noun/verb phrases in ``doc`` that contain 2+ distinct RAKE keyphrases.

    Args:
        rake_keywords: iterable of ``(score, keyphrase)`` pairs.
        doc: parsed spaCy doc (uses ``doc.noun_chunks`` and, for VERB tokens,
            ``token.children``).

    Returns:
        List of dicts with ``'type'`` (``'noun phrase'`` or ``'verb phrase'``),
        ``'phrase'`` (the phrase text) and ``'keyphrases'`` (the lowercased
        keyphrases found in it, in first-seen keyword order). Noun-phrase
        entries come first, then verb-phrase entries.

    Matching is a case-insensitive substring test of the keyphrase against
    the phrase text.
    """
    # Lowercase and de-duplicate while keeping first-seen order. Without the
    # de-duplication a keyphrase that is listed twice (or in two casings)
    # would count as two matches and make a phrase holding a single keyphrase
    # look like a relationship. Blank keyphrases are dropped because the
    # empty string is a substring of every phrase.
    rake_phrases = list(dict.fromkeys(
        keyword.lower() for _score, keyword in rake_keywords if keyword.strip()
    ))

    noun_phrases = [chunk.text for chunk in doc.noun_chunks]

    # Verb phrases: verb text plus selected direct children; bare verbs with
    # no such child are not considered here.
    attached_deps = ("dobj", "attr", "prep", "pobj", "advmod", "aux", "auxpass", "neg")
    verb_phrases = []
    for token in doc:
        if token.pos_ != "VERB":
            continue
        phrase_tokens = [token.text]
        phrase_tokens.extend(
            child.text for child in token.children if child.dep_ in attached_deps
        )
        if len(phrase_tokens) > 1:
            verb_phrases.append(" ".join(phrase_tokens))

    labelled_phrases = [('noun phrase', phrase) for phrase in noun_phrases]
    labelled_phrases.extend(('verb phrase', phrase) for phrase in verb_phrases)

    relationships = []
    for phrase_type, phrase in labelled_phrases:
        phrase_lower = phrase.lower()
        matching_keyphrases = [kp for kp in rake_phrases if kp in phrase_lower]
        if len(matching_keyphrases) >= 2:
            relationships.append({
                'type': phrase_type,
                'phrase': phrase,
                'keyphrases': matching_keyphrases
            })

    return relationships
| 324 | |
def analyze_yake_phrase_relationships(yake_keywords, doc):
    """Find noun/verb phrases in ``doc`` that contain 2+ distinct YAKE keyphrases.

    Args:
        yake_keywords: iterable of ``(score, keyphrase)`` pairs.
        doc: parsed spaCy doc (uses ``doc.noun_chunks`` and, for VERB tokens,
            ``token.children``).

    Returns:
        List of dicts with ``'type'`` (``'noun phrase'`` or ``'verb phrase'``),
        ``'phrase'`` (the phrase text) and ``'keyphrases'`` (the lowercased
        keyphrases found in it, in first-seen keyword order). Noun-phrase
        entries come first, then verb-phrase entries.

    Matching is a case-insensitive substring test of the keyphrase against
    the phrase text.
    """
    # Lowercasing can make two keyphrases identical; count each only once so
    # that "2+ keyphrases" really means two different ones. Blank keyphrases
    # are skipped because "" is a substring of everything.
    yake_phrases = []
    seen = set()
    for _score, keyword in yake_keywords:
        lowered = keyword.lower()
        if lowered.strip() and lowered not in seen:
            seen.add(lowered)
            yake_phrases.append(lowered)

    def keyphrases_in(phrase):
        """Return the known keyphrases occurring in ``phrase`` (case-insensitive)."""
        haystack = phrase.lower()
        return [kp for kp in yake_phrases if kp in haystack]

    relationships = []

    # Noun phrases come straight from spaCy's noun chunks
    for chunk in doc.noun_chunks:
        found = keyphrases_in(chunk.text)
        if len(found) >= 2:
            relationships.append({
                'type': 'noun phrase',
                'phrase': chunk.text,
                'keyphrases': found
            })

    # Verb phrases: verb text plus selected direct children; a verb without
    # any such child is not considered.
    attached_deps = ("dobj", "attr", "prep", "pobj", "advmod", "aux", "auxpass", "neg")
    for token in doc:
        if token.pos_ != "VERB":
            continue
        dependents = [child.text for child in token.children if child.dep_ in attached_deps]
        if not dependents:
            continue
        verb_phrase = " ".join([token.text] + dependents)
        found = keyphrases_in(verb_phrase)
        if len(found) >= 2:
            relationships.append({
                'type': 'verb phrase',
                'phrase': verb_phrase,
                'keyphrases': found
            })

    return relationships
| 367 | |
def extract_keyword_relations(doc, message):
    """Treat every NOUN, PROPN and VERB as a keyword and attach up to 2 related words.

    Args:
        doc: parsed spaCy doc; processed sentence by sentence via ``doc.sents``.
        message: not read by this function.

    Returns:
        One dict per sentence that has at least one keyword, with
        ``'sentence'`` (text) and ``'keywords'``. Each keyword dict has
        ``'keyword'``, ``'pos'``, ``'lemma'`` and ``'relations'``; a relation
        has ``'word'``, ``'lemma'``, ``'relation'``, ``'type'`` and ``'pos'``.

    Related-word candidates carry a priority (3 highest). The two with the
    highest priority are kept; ties keep discovery order because the sort is
    stable.
    """
    # child dependency label -> (priority, relation type), per keyword kind
    verb_child_roles = {
        'nsubj': (3, 'subject'),
        'nsubjpass': (3, 'subject'),
        'dobj': (3, 'object'),
        'attr': (3, 'object'),
        'advmod': (2, 'adverb'),
        'iobj': (2, 'indirect_obj'),
        'dative': (2, 'indirect_obj'),
    }
    noun_child_roles = {
        'amod': (3, 'adjective'),
        'compound': (3, 'compound'),
        'poss': (2, 'modifier'),
        'nmod': (2, 'modifier'),
    }

    sentence_keywords = []

    for sent in doc.sents:
        keywords_data = []

        for token in sent:
            tag = token.pos_
            if tag == 'VERB':
                child_roles = verb_child_roles
            elif tag in ('NOUN', 'PROPN'):
                child_roles = noun_child_roles
            else:
                # Anything else is not a keyword
                continue

            # Entries are (priority, related token, relation label, relation type)
            ranked = []
            for child in token.children:
                label = child.dep_
                if label == 'prep':
                    # Look through the preposition to its object(s)
                    for grandchild in child.children:
                        if grandchild.dep_ == 'pobj':
                            ranked.append(
                                (2, grandchild, f'{child.text}_{grandchild.dep_}', 'prep_obj')
                            )
                elif label in child_roles:
                    priority, rel_type = child_roles[label]
                    ranked.append((priority, child, label, rel_type))

            if tag != 'VERB':
                # Nouns additionally get a low-priority link to what governs them
                governor = token.head
                if governor.pos_ == 'VERB':
                    if token.dep_ in ('nsubj', 'nsubjpass', 'dobj', 'attr'):
                        ranked.append((1, governor, token.dep_, 'verb_relation'))
                elif governor.pos_ == 'ADP':
                    # Follow the preposition up to whatever it connects to
                    if governor.head.pos_ in ('NOUN', 'PROPN', 'VERB'):
                        ranked.append(
                            (1, governor.head, f'via_{governor.text}', 'prep_head')
                        )

            # Highest priority first; keep the best two
            ranked.sort(key=lambda entry: entry[0], reverse=True)

            keywords_data.append({
                'keyword': token.text,
                'pos': tag,
                'lemma': token.lemma_,
                'relations': [
                    {
                        'word': related.text,
                        'lemma': related.lemma_,
                        'relation': relation,
                        'type': rel_type,
                        'pos': related.pos_
                    }
                    for _priority, related, relation, rel_type in ranked[:2]
                ]
            })

        if keywords_data:
            sentence_keywords.append({
                'sentence': sent.text,
                'keywords': keywords_data
            })

    return sentence_keywords
| 452 | |
def filter_keywords_by_stopwords(keyword_relations):
    """
    Filter out common/generic keywords using an expanded stop word list.
    Always keeps proper nouns (PROPN).

    A VERB or NOUN keyword is kept only when its lowercased lemma is not in
    the corresponding stop list below. Keywords with any other POS tag are
    dropped. Sentences left without keywords are omitted from the result.

    Args:
        keyword_relations: List of sentence dicts, each with 'sentence' and
            'keywords'; every keyword dict provides 'pos' and 'lemma'.

    Returns:
        Filtered keyword_relations without generic keywords. The kept
        keyword dicts are the same objects as in the input (not copies).
    """
    # Common verbs to exclude
    stop_verbs = {
        'be', 'have', 'do', 'say', 'get', 'make', 'go', 'know', 'take', 'see',
        'come', 'think', 'look', 'want', 'give', 'use', 'find', 'tell', 'ask',
        'work', 'seem', 'feel', 'try', 'leave', 'call', 'need', 'become', 'show',
        'mean', 'keep', 'let', 'begin', 'help', 'talk', 'turn', 'start', 'run',
        'move', 'like', 'live', 'believe', 'hold', 'bring', 'happen', 'write',
        'provide', 'sit', 'stand', 'lose', 'pay', 'meet', 'include', 'continue',
        'set', 'learn', 'change', 'lead', 'understand', 'watch', 'follow', 'stop',
        'create', 'speak', 'read', 'allow', 'add', 'spend', 'grow', 'open', 'walk',
        'win', 'offer', 'remember', 'consider', 'appear', 'buy', 'wait', 'serve',
        'die', 'send', 'expect', 'build', 'stay', 'fall', 'cut', 'reach', 'kill',
        'remain', 'suggest', 'raise', 'pass', 'sell', 'require', 'report', 'decide'
    }

    # Common nouns to exclude
    stop_nouns = {
        'thing', 'time', 'year', 'way', 'day', 'man', 'people', 'person', 'woman',
        'life', 'child', 'world', 'hand', 'part', 'place', 'case', 'week', 'company',
        'system', 'program', 'question', 'work', 'number', 'night', 'point', 'home',
        'water', 'room', 'mother', 'area', 'money', 'story', 'fact', 'month', 'lot',
        'right', 'study', 'book', 'eye', 'job', 'word', 'issue', 'side', 'kind',
        'head', 'house', 'service', 'friend', 'father', 'power', 'hour', 'game',
        'line', 'end', 'member', 'law', 'car', 'city', 'community', 'name', 'president',
        'team', 'minute', 'idea', 'body', 'information', 'back', 'parent', 'face',
        'others', 'level', 'office', 'door', 'health', 'art', 'war', 'history',
        'party', 'result', 'change', 'morning', 'reason', 'research', 'girl', 'guy',
        'moment', 'air', 'teacher', 'force', 'education'
    }

    def is_specific(kw):
        """Return True when the keyword should survive the filter."""
        pos = kw['pos']
        if pos == 'PROPN':
            # Always keep proper nouns
            return True
        if pos == 'VERB':
            return kw['lemma'].lower() not in stop_verbs
        if pos == 'NOUN':
            return kw['lemma'].lower() not in stop_nouns
        return False

    filtered_relations = []
    for sent_info in keyword_relations:
        filtered_keywords = [kw for kw in sent_info['keywords'] if is_specific(kw)]
        if filtered_keywords:
            filtered_relations.append({
                'sentence': sent_info['sentence'],
                'keywords': filtered_keywords
            })

    return filtered_relations
| 519 | |
def extract_sentence_word_lists(filtered_keyword_relations):
    """
    Extract flat lists of words per sentence from filtered keywords.
    Returns lemmatized keywords + their related words, filtered to remove pronouns and stop words.

    For each keyword: a pronoun keyword is skipped together with all of its
    relations. Otherwise the keyword lemma is added if it is a PROPN, or a
    VERB/NOUN whose lowercased lemma is not a stop word. Its relations are
    then examined regardless of whether the keyword itself was added; a
    related word is added under the same rule, and ADJ/ADV relations are
    always added (unless they are pronouns).

    Args:
        filtered_keyword_relations: Filtered list of sentence dicts with keywords

    Returns:
        List of dicts with 'sentence' and 'words' (sorted list of unique
        lemmas, in their original casing)
    """
    # Common verbs to exclude
    stop_verbs = {
        'be', 'have', 'do', 'say', 'get', 'make', 'go', 'know', 'take', 'see',
        'come', 'think', 'look', 'want', 'give', 'use', 'find', 'tell', 'ask',
        'work', 'seem', 'feel', 'try', 'leave', 'call', 'need', 'become', 'show',
        'mean', 'keep', 'let', 'begin', 'help', 'talk', 'turn', 'start', 'run',
        'move', 'like', 'live', 'believe', 'hold', 'bring', 'happen', 'write',
        'provide', 'sit', 'stand', 'lose', 'pay', 'meet', 'include', 'continue',
        'set', 'learn', 'change', 'lead', 'understand', 'watch', 'follow', 'stop',
        'create', 'speak', 'read', 'allow', 'add', 'spend', 'grow', 'open', 'walk',
        'win', 'offer', 'remember', 'consider', 'appear', 'buy', 'wait', 'serve',
        'die', 'send', 'expect', 'build', 'stay', 'fall', 'cut', 'reach', 'kill',
        'remain', 'suggest', 'raise', 'pass', 'sell', 'require', 'report', 'decide'
    }

    # Common nouns to exclude
    stop_nouns = {
        'thing', 'time', 'year', 'way', 'day', 'man', 'people', 'person', 'woman',
        'life', 'child', 'world', 'hand', 'part', 'place', 'case', 'week', 'company',
        'system', 'program', 'question', 'work', 'number', 'night', 'point', 'home',
        'water', 'room', 'mother', 'area', 'money', 'story', 'fact', 'month', 'lot',
        'right', 'study', 'book', 'eye', 'job', 'word', 'issue', 'side', 'kind',
        'head', 'house', 'service', 'friend', 'father', 'power', 'hour', 'game',
        'line', 'end', 'member', 'law', 'car', 'city', 'community', 'name', 'president',
        'team', 'minute', 'idea', 'body', 'information', 'back', 'parent', 'face',
        'others', 'level', 'office', 'door', 'health', 'art', 'war', 'history',
        'party', 'result', 'change', 'morning', 'reason', 'research', 'girl', 'guy',
        'moment', 'air', 'teacher', 'force', 'education'
    }

    # Pronouns to exclude (lemmatized forms)
    pronouns = {
        'i', 'you', 'he', 'she', 'it', 'we', 'they',
        'me', 'him', 'her', 'us', 'them',
        'my', 'your', 'his', 'its', 'our', 'their',  # 'her' is already listed above
        'mine', 'yours', 'hers', 'ours', 'theirs',
        'myself', 'yourself', 'himself', 'herself', 'itself', 'ourselves', 'themselves',
        'this', 'that', 'these', 'those',
        'who', 'whom', 'whose', 'which', 'what',
        'anybody', 'anyone', 'anything', 'everybody', 'everyone', 'everything',
        'nobody', 'nothing', 'somebody', 'someone', 'something'
    }

    def is_pronoun(lemma, pos):
        """True for PRON-tagged words and for lemmas in the pronoun list."""
        return pos == 'PRON' or lemma.lower() in pronouns

    def passes_stop_filter(lemma, pos):
        """True for PROPN, and for VERB/NOUN lemmas that are not stop words."""
        if pos == 'PROPN':
            return True
        if pos == 'VERB':
            return lemma.lower() not in stop_verbs
        if pos == 'NOUN':
            return lemma.lower() not in stop_nouns
        return False

    sentence_lists = []

    for sent_info in filtered_keyword_relations:
        words = set()

        for kw in sent_info['keywords']:
            # A pronoun keyword is skipped entirely, relations included
            if is_pronoun(kw['lemma'], kw['pos']):
                continue

            if passes_stop_filter(kw['lemma'], kw['pos']):
                words.add(kw['lemma'])

            # Related words are considered even if the keyword was a stop word
            for rel in kw['relations']:
                if is_pronoun(rel['lemma'], rel['pos']):
                    continue
                # Adjectives and adverbs are kept in addition to the
                # PROPN/VERB/NOUN rule used for keywords
                if rel['pos'] in ('ADJ', 'ADV') or passes_stop_filter(rel['lemma'], rel['pos']):
                    words.add(rel['lemma'])

        sentence_lists.append({
            'sentence': sent_info['sentence'],
            'words': sorted(words)  # Sort for consistent output
        })

    return sentence_lists
| 621 | |
def is_common_phrase(phrase):
    """Return True when ``phrase`` holds no proper noun and no unusual word.

    The phrase is parsed with the module-level spaCy pipeline. Punctuation is
    ignored. The phrase is rejected as soon as a token is tagged PROPN, or is
    an alphabetic non-stop-word whose ``rank`` is 0 or greater than 10000.
    """
    # NOTE(review): this relies on token.rank behaving like a frequency rank
    # for the loaded model -- confirm that holds for en_core_web_sm.
    for token in nlp(phrase):
        if token.is_punct:
            continue
        if token.pos_ == "PROPN":
            return False
        is_content_word = token.is_alpha and not token.is_stop
        # rank == 0 is treated as "no rank info", a large rank as "rare"
        if is_content_word and (token.rank == 0 or token.rank > 10000):
            return False
    return True
| 639 | |
def print_keywords(keywords, time_taken, file, extractor_name="RAKE", lemmatized_data=None):
    """Write one extractor's keywords to ``file``.

    Args:
        keywords: sequence of ``(score, word)`` pairs; when empty only a
            "No ... keywords extracted" line is written.
        time_taken: extraction time in seconds, shown in the summary line.
        file: writable text stream.
        extractor_name: label used in the heading and the "none" message.
        lemmatized_data: optional sequence of ``(score, original, lemmatized)``;
            when truthy it is listed instead of ``keywords`` and the lemma is
            appended whenever it differs from the lowercased original.
    """
    if not keywords:
        file.write(f"No {extractor_name} keywords extracted\n")
        return

    file.write(f"{extractor_name} Keywords:\n")

    if lemmatized_data:
        for score, original, lemmatized in lemmatized_data:
            entry = f" - {original} (score: {score:.2f})"
            # Only show the lemma when it actually tells something new
            if lemmatized != original.lower():
                entry += f" → {lemmatized}"
            file.write(entry + "\n")
    else:
        for score, word in keywords:
            file.write(f" - {word} (score: {score:.2f})\n")

    # The count always refers to `keywords`, also when lemmatized_data is listed
    file.write(f"Total keywords: {len(keywords)} extracted in {time_taken:.4f} seconds\n")
| 658 | |
def print_named_entities(entities, file):
    """Write named entities to ``file``.

    ``entities`` is a sequence of ``(entity_text, entity_type)`` pairs. An
    empty sequence produces a single "No named entities extracted" line.
    """
    if not entities:
        file.write("\nNo named entities extracted\n")
        return

    file.write("\nNamed Entities:\n")
    for entity_text, entity_type in entities:
        line = f" - {entity_text} ({entity_type})\n"
        file.write(line)
    file.write(f"Total named entities: {len(entities)}\n")
| 668 | |
def print_phrases(noun_phrases, verb_phrases, file):
    """Write the noun-phrase section followed by the verb-phrase section to ``file``.

    Each section has a heading, one line per phrase and a total line; an
    empty list produces a "No ... phrases extracted" line instead.
    """
    # (heading, phrases, total label, message when empty)
    sections = (
        ("\nNoun Phrases:\n", noun_phrases, "Total noun phrases", " No noun phrases extracted\n"),
        ("\nVerb Phrases:\n", verb_phrases, "Total verb phrases", " No verb phrases extracted\n"),
    )

    for heading, phrases, total_label, empty_message in sections:
        file.write(heading)
        if not phrases:
            file.write(empty_message)
            continue
        for phrase in phrases:
            file.write(f" - {phrase}\n")
        file.write(f"{total_label}: {len(phrases)}\n")
| 686 | |
def print_dependencies(sentences_deps, time_taken, file):
    """Write the per-sentence dependency report to ``file``.

    Args:
        sentences_deps: list of sentence dicts with ``'sentence'`` and
            ``'dependencies'``, and optionally ``'verb_relations'`` and
            ``'independent_nouns'``.
        time_taken: extraction time in seconds, shown in the heading.
        file: writable text stream.
    """
    file.write(f"\nDependency Relations (extracted in {time_taken:.4f}sec):\n")

    if not sentences_deps:
        file.write(" No dependencies extracted\n")
        return

    for number, info in enumerate(sentences_deps, start=1):
        file.write(f"\n Sentence {number}: {info['sentence']}\n")

        # Raw token -> head edges come first
        file.write(" Dependencies:\n")
        for edge in info['dependencies']:
            file.write(f" {edge['token']} ({edge['pos']}) --[{edge['dep']}]--> {edge['head']} ({edge['head_pos']})\n")

        # Each verb with the nouns and adverbs that depend on it
        verb_relations = info.get('verb_relations')
        if verb_relations:
            file.write(" Verb Relations:\n")
            for relation in verb_relations:
                if relation['nouns']:
                    nouns_str = ', '.join(relation['nouns'])
                else:
                    nouns_str = 'none'
                if relation['advs']:
                    advs_str = ', '.join(relation['advs'])
                else:
                    advs_str = 'none'
                file.write(f" VERB: {relation['verb']} | NOUNs: {nouns_str} | ADVs: {advs_str}\n")

        # Nouns that do not depend on any verb
        loose_nouns = info.get('independent_nouns')
        if loose_nouns:
            file.write(" Independent NOUNs (not verb-dependent):\n")
            for noun in loose_nouns:
                line = f" {noun['noun']} --[{noun['dep']}]--> {noun['head']} ({noun['head_pos']})"
                # For a noun under a preposition, extend the chain by one hop
                if 'prep_head' in noun:
                    line += f" --[{noun['prep_dep']}]--> {noun['prep_head']} ({noun['prep_head_pos']})"
                file.write(line + "\n")

    file.write(f"\nTotal sentences: {len(sentences_deps)}\n")
| 720 | |
def print_keyword_relationships(keyword_relations, file, header="Keyword Relationships (All NOUNs and VERBs)"):
    """Write the keyword/relation listing to ``file`` under ``header``.

    ``keyword_relations`` is a list of sentence dicts with ``'sentence'`` and
    ``'keywords'``; each keyword provides ``'lemma'``, ``'pos'`` and
    ``'relations'`` (dicts with ``'lemma'`` and ``'type'``). Keywords
    without relations are listed with ``none``.
    """
    file.write(f"\n{header}:\n")

    if not keyword_relations:
        file.write(" No keywords found\n")
        return

    for sent_info in keyword_relations:
        file.write(f"\n Sentence: {sent_info['sentence']}\n")
        file.write(" Keywords with relations:\n")

        for kw in sent_info['keywords']:
            rendered = [f"{rel['lemma']}({rel['type']})" for rel in kw['relations']]
            if rendered:
                related_str = ', '.join(rendered)
            else:
                related_str = 'none'
            file.write(f" {kw['lemma']} [{kw['pos']}]: {related_str}\n")
| 740 | |
# Accumulators for the processing loop that follows.
count = 0

# Summed wall-clock time per extraction stage
rake_total_time = yake_total_time = keybert_total_time = dep_total_time = 0

total_keyword_density = 0.0
total_words = 0  # Track total words across all messages
all_keywords = set()  # Track unique keywords across all messages
| 749 | |
| 750 | with open(output_file, 'w', encoding='utf-8') as f: |
| 751 | for item in rawData: |
| 752 | # Get message text |
| 753 | message = item['speaker'] + ": " + item['content'] |
| 754 | |
| 755 | # Time RAKE extraction |
| 756 | rake_start = time.time() |
| 757 | rake_keywords = extract_keywords_rake(message) |
| 758 | rake_end = time.time() |
| 759 | rake_time = rake_end - rake_start |
| 760 | rake_total_time += rake_time |
| 761 | |
| 762 | # Time YAKE extraction |
| 763 | yake_start = time.time() |
| 764 | yake_keywords = extract_keywords_yake(message) |
| 765 | yake_end = time.time() |
| 766 | yake_time = yake_end - yake_start |
| 767 | yake_total_time += yake_time |
| 768 | |
| 769 | # Time KeyBERT extraction |
| 770 | keybert_start = time.time() |
| 771 | keybert_keywords = extract_keywords_keybert(message) |
| 772 | keybert_end = time.time() |
| 773 | keybert_time = keybert_end - keybert_start |
| 774 | keybert_total_time += keybert_time |
| 775 | |
| 776 | # Time dependency extraction (also returns doc for analysis) |
| 777 | dep_start = time.time() |
| 778 | dependencies, spacy_doc = extract_dependencies(message) |
| 779 | dep_end = time.time() |
| 780 | dep_time = dep_end - dep_start |
| 781 | dep_total_time += dep_time |
| 782 | |
| 783 | # Lemmatize RAKE keyphrases using the already-computed spacy doc |
| 784 | rake_lemmatized = lemmatize_rake_keyphrases_from_doc(rake_keywords, spacy_doc) |
| 785 | |
| 786 | # Lemmatize YAKE keyphrases using the already-computed spacy doc |
| 787 | yake_lemmatized = lemmatize_yake_keyphrases_from_doc(yake_keywords, spacy_doc) |
| 788 | |
| 789 | # Analyze RAKE keyphrase relationships using the spacy doc |
| 790 | rake_relationships = analyze_rake_phrase_relationships(rake_keywords, spacy_doc) |
| 791 | |
| 792 | # Analyze YAKE keyphrase relationships using the spacy doc |
| 793 | yake_relationships = analyze_yake_phrase_relationships(yake_keywords, spacy_doc) |
| 794 | |
| 795 | # Find keyword relationships for indexing |
| 796 | keyword_relationships = extract_keyword_relations(spacy_doc, message) |
| 797 | |
| 798 | # Filter keywords using stop words |
| 799 | filtered_keyword_relationships = filter_keywords_by_stopwords(keyword_relationships) |
| 800 | |
| 801 | # Extract named entities |
| 802 | # entities = extract_named_entities(message) |
| 803 | |
| 804 | # Extract noun and verb phrases |
| 805 | #noun_phrases, verb_phrases = extract_phrases(message) |
| 806 | |
| 807 | # Write original message |
| 808 | f.write(f"\n{'='*80}\n") |
| 809 | f.write(f"Message {count + 1}:\n") |
| 810 | f.write(message + "\n") |
| 811 | f.write('-' * 80 + "\n") |
| 812 | |
| 813 | if verbose: |
| 814 | # Write RAKE keywords with lemmatization |
| 815 | print_keywords(rake_keywords, rake_time, f, "RAKE", lemmatized_data=rake_lemmatized) |
| 816 | |
| 817 | # Write YAKE keywords with lemmatization |
| 818 | f.write("\n") |
| 819 | print_keywords(yake_keywords, yake_time, f, "YAKE", lemmatized_data=yake_lemmatized) |
| 820 | |
| 821 | # Write KeyBERT keywords |
| 822 | f.write("\n") |
| 823 | print_keywords(keybert_keywords, keybert_time, f, "KeyBERT") |
| 824 | |
| 825 | # Write dependencies |
| 826 | print_dependencies(dependencies, dep_time, f) |
| 827 | |
| 828 | # Write keyword relationships for indexing (unfiltered) |
| 829 | print_keyword_relationships(keyword_relationships, f) |
| 830 | |
| 831 | # Write filtered keyword relationships for indexing |
| 832 | print_keyword_relationships(filtered_keyword_relationships, f, header="Filtered Keywords (Stop Words Removed)") |
| 833 | |
| 834 | # Extract and print flat word lists per sentence |
| 835 | sentence_word_lists = extract_sentence_word_lists(filtered_keyword_relationships) |
| 836 | f.write(f"\nSentence Word Lists (Keywords + Relations):\n") |
| 837 | for sent_list in sentence_word_lists: |
| 838 | f.write(f" {sent_list['words']}\n") |
| 839 | |
| 840 | # Calculate and print word statistics |
| 841 | # Count unique words across all sentence lists |
| 842 | all_keywords_this_message = set() |
| 843 | for sent_list in sentence_word_lists: |
| 844 | all_keywords_this_message.update(sent_list['words']) |
| 845 | |
| 846 | # Update global tracking |
| 847 | all_keywords.update(all_keywords_this_message) |
| 848 | |
| 849 | # Count total words in original message |
| 850 | message_word_count = len(message.split()) |
| 851 | total_words += message_word_count |
| 852 | unique_keyword_count = len(all_keywords_this_message) |
| 853 | |
| 854 | f.write(f"\nWord Statistics:\n") |
| 855 | f.write(f" Total words in message: {message_word_count}\n") |
| 856 | f.write(f" Unique filtered keywords: {unique_keyword_count}\n") |
| 857 | if message_word_count > 0: |
| 858 | percentage = (unique_keyword_count / message_word_count) * 100 |
| 859 | f.write(f" Keyword density: {percentage:.1f}%\n") |
| 860 | total_keyword_density += percentage |
| 861 | |
| 862 | # Write timing comparison (only if verbose) |
| 863 | if verbose: |
| 864 | f.write(f"\nTiming Comparison:\n") |
| 865 | f.write(f" RAKE: {rake_time:.4f}sec\n") |
| 866 | f.write(f" YAKE: {yake_time:.4f}sec\n") |
| 867 | f.write(f" KeyBERT: {keybert_time:.4f}sec\n") |
| 868 | f.write(f" Dependencies: {dep_time:.4f}sec\n") |
| 869 | times = {'RAKE': rake_time, 'YAKE': yake_time, 'KeyBERT': keybert_time, 'Dependencies': dep_time} |
| 870 | fastest = min(times, key=times.get) |
| 871 | f.write(f" Fastest: {fastest}\n") |
| 872 | |
| 873 | # Write named entities |
| 874 | #print_named_entities(entities, f) |
| 875 | |
| 876 | # Write phrases |
| 877 | #print_phrases(noun_phrases, verb_phrases, f) |
| 878 | |
| 879 | count += 1 |
| 880 | |
| 881 | # Print progress indicator every 50 messages |
| 882 | if count % 50 == 0: |
| 883 | print(f"Progress: Processed {count} messages...") |
| 884 | |
| 885 | # Write overall timing summary |
| 886 | f.write(f"\n\n{'='*80}\n") |
| 887 | f.write(f"OVERALL TIMING SUMMARY ({count} messages):\n") |
| 888 | f.write(f"{'='*80}\n") |
| 889 | f.write(f"Total RAKE time: {rake_total_time:.4f}sec (avg: {rake_total_time/count:.4f}sec per message)\n") |
| 890 | f.write(f"Total YAKE time: {yake_total_time:.4f}sec (avg: {yake_total_time/count:.4f}sec per message)\n") |
| 891 | f.write(f"Total KeyBERT time: {keybert_total_time:.4f}sec (avg: {keybert_total_time/count:.4f}sec per message)\n") |
| 892 | f.write(f"Total Dependency time: {dep_total_time:.4f}sec (avg: {dep_total_time/count:.4f}sec per message)\n") |
| 893 | |
| 894 | avg_keyword_density = total_keyword_density / count if count > 0 else 0 |
| 895 | f.write(f"\nAverage keyword density: {avg_keyword_density:.1f}%\n") |
| 896 | f.write(f"\nTotal words across all messages: {total_words}\n") |
| 897 | f.write(f"Unique keywords across all messages: {len(all_keywords)}\n") |
| 898 | if total_words > 0: |
| 899 | overall_density = (len(all_keywords) / total_words) * 100 |
| 900 | f.write(f"Overall keyword density: {overall_density:.1f}%\n") |
| 901 | |
| 902 | total_times = {'RAKE': rake_total_time, 'YAKE': yake_total_time, 'KeyBERT': keybert_total_time, 'Dependencies': dep_total_time} |
| 903 | fastest = min(total_times, key=total_times.get) |
| 904 | slowest = max(total_times, key=total_times.get) |
| 905 | f.write(f"\nOverall fastest: {fastest}\n") |
| 906 | f.write(f"Overall slowest: {slowest}\n") |
| 907 | speedup = total_times[slowest] / total_times[fastest] |
| 908 | f.write(f"Speedup factor (fastest vs slowest): {speedup:.2f}x\n") |
| 909 | |
| 910 | print(f"Extraction complete. Results written to {output_file}") |
| 911 | print(f"Processed {count} messages") |
| 912 | print(f"RAKE total time: {rake_total_time:.4f}sec (avg: {rake_total_time/count:.4f}sec)") |
| 913 | print(f"YAKE total time: {yake_total_time:.4f}sec (avg: {yake_total_time/count:.4f}sec)") |
| 914 | print(f"KeyBERT total time: {keybert_total_time:.4f}sec (avg: {keybert_total_time/count:.4f}sec)") |
| 915 | print(f"Dependency time: {dep_total_time:.4f}sec (avg: {dep_total_time/count:.4f}sec)") |
| 916 | total_times = {'RAKE': rake_total_time, 'YAKE': yake_total_time, 'KeyBERT': keybert_total_time, 'Dependencies': dep_total_time} |
| 917 | fastest = min(total_times, key=total_times.get) |
| 918 | print(f"Overall fastest: {fastest}") |
| 919 | |