microsoft/TypeAgent

Public

mirrored from https://github.com/microsoft/TypeAgentAvailable

CodeCommitsIssuesPull requestsActionsInsightsSecurity
ac6c97707d9fb5efeb7846917c307cd7057a2190

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

python/fineTuning/unsloth/nltkExtract.py

905lines · modecode

1# Copyright (c) Microsoft Corporation and Henry Lucco.
2# Licensed under the MIT License.
3
4from rake_nltk import Rake
5import sys
6import os
7from argparse import ArgumentParser
8import nltk
9from nltk import word_tokenize, pos_tag, ne_chunk
10from nltk.tree import Tree
11import spacy
12import yake
13from keybert import KeyBERT
14
# Command-line interface: dataset location, keyword phrase length cap,
# report path and verbosity switch.
parser = ArgumentParser(
    description="Extract keywords from dataset using NLTK-RAKE."
)
parser.add_argument(
    "--dataset_path",
    type=str,
    default='/data/npr_chunks_no_embedding_seed127_samples5000_test.json',
    help="Path to the dataset file.",
)
parser.add_argument(
    "--max_length",
    type=int,
    default=1,
    help="Maximum number of words in a keyword phrase.",
)
parser.add_argument(
    "--output_file",
    type=str,
    default='extraction.txt',
    help="Path to the output file.",
)
parser.add_argument(
    "--verbose",
    action='store_true',
    help="Enable verbose output (shows all extraction details). Default is non-verbose.",
)
args = parser.parse_args(sys.argv[1:])
dataset_path, max_length = args.dataset_path, args.max_length
output_file, verbose = args.output_file, args.verbose

# RAKE extractor; the phrase length cap comes from --max_length.
rake = Rake(max_length=max_length)

# YAKE extractor. Arguments in order: language, maximum n-gram size,
# deduplication threshold, number of keywords to return.
yake_extractor = yake.KeywordExtractor(lan="en", n=max_length, dedupLim=0.9, top=20)

# KeyBERT model, constructed with its defaults.
keybert_model = KeyBERT()

# spaCy pipeline used for tagging, parsing and lemmas. Only the "ner"
# component is disabled here; every other component of the model is loaded.
nlp = spacy.load("en_core_web_sm", disable=["ner"])
40
41# Load an array of JSON objects with properties like speaker and content
42import json
43import time
44
# Load the whole dataset (a JSON document) up front. The encoding is pinned to
# UTF-8 so the read does not depend on the platform's default locale encoding;
# the report file further down is written with the same encoding.
with open(dataset_path, encoding='utf-8') as f:
    rawData = json.load(f)
47
def extract_keywords_rake(text):
    """Run the module-level RAKE extractor over ``text``.

    Returns the ranked phrases with their scores exactly as
    ``rake.get_ranked_phrases_with_scores()`` reports them. The phrase length
    limit was fixed when ``rake`` was constructed.
    """
    rake.extract_keywords_from_text(text)
    return rake.get_ranked_phrases_with_scores()
56
def extract_keywords_yake(text):
    """Run the module-level YAKE extractor over ``text``.

    YAKE reports ``(keyword, score)`` pairs, where a lower score is better.
    Each pair is flipped to ``(score, keyword)`` so the result has the same
    layout as the RAKE output.
    """
    flipped = []
    for keyword, score in yake_extractor.extract_keywords(text):
        flipped.append((score, keyword))
    return flipped
64
def extract_keywords_keybert(text):
    """Run the module-level KeyBERT model over ``text``.

    Asks for the top 20 keyphrases of 1 to ``max_length`` words. KeyBERT
    reports ``(keyword, score)`` pairs, where a higher score is better; each
    pair is flipped to ``(score, keyword)`` to match the RAKE layout.
    """
    ranked = keybert_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, max_length),
        top_n=20,
    )
    flipped = []
    for keyword, score in ranked:
        flipped.append((score, keyword))
    return flipped
73
def extract_named_entities(text):
    """Extract named entities from ``text`` with NLTK.

    The text is tokenized, POS-tagged and chunked with ``ne_chunk`` (typed
    entities, ``binary=False``). Every subtree of the chunk result becomes one
    ``(entity_text, entity_type)`` pair, where the text is the subtree's
    leaf words joined by single spaces and the type is the subtree label.
    """
    tagged = pos_tag(word_tokenize(text))
    chunked = ne_chunk(tagged, binary=False)

    # Plain (word, tag) tuples are ordinary tokens; only Tree nodes are entities.
    return [
        (" ".join(word for word, _tag in node.leaves()), node.label())
        for node in chunked
        if isinstance(node, Tree)
    ]
92
def extract_phrases(text):
    """Extract noun phrases and simple verb phrases from ``text`` with spaCy.

    Returns:
        ``(noun_phrases, verb_phrases)``. Noun phrases are the texts of the
        doc's noun chunks. Each VERB token contributes one verb phrase: its
        own text followed by the texts of its direct children whose
        dependency label is in the selected set, joined by spaces.
    """
    selected_deps = ("dobj", "attr", "prep", "pobj", "advmod", "aux", "auxpass", "neg")
    doc = nlp(text)

    noun_phrases = [chunk.text for chunk in doc.noun_chunks]

    verb_phrases = []
    for token in doc:
        if token.pos_ != "VERB":
            continue
        parts = [token.text]
        parts.extend(kid.text for kid in token.children if kid.dep_ in selected_deps)
        # A verb without any selected child yields just its own text.
        verb_phrases.append(" ".join(parts))

    return noun_phrases, verb_phrases
115
def extract_dependencies(text):
    """Parse ``text`` with spaCy and summarise dependencies per sentence.

    Returns:
        ``(sentences_deps, doc)`` where ``doc`` is the parsed spaCy document
        and ``sentences_deps`` holds one dict per sentence with the keys:

        * ``sentence``: the sentence text.
        * ``dependencies``: one dict per token with ``token``, ``pos``,
          ``dep``, ``head`` and ``head_pos``.
        * ``verb_relations``: one dict per VERB token with ``verb``,
          ``nouns`` (texts of its NOUN/PROPN children) and ``advs`` (texts of
          its ADV children).
        * ``independent_nouns``: NOUN/PROPN tokens that are not a direct child
          of any VERB in the sentence, with ``noun``, ``dep``, ``head`` and
          ``head_pos``. When the head is an ADP, ``prep_head``,
          ``prep_head_pos`` and ``prep_dep`` describe the preposition's head.
    """
    noun_tags = ("NOUN", "PROPN")
    doc = nlp(text)

    sentences_deps = []
    for sent in doc.sents:
        sent_deps = []
        verb_relations = []
        # Document positions (Token.i) of nouns attached to a verb. Positions
        # are used instead of surface text so that a second occurrence of the
        # same word that is NOT attached to a verb is still reported below.
        verb_noun_positions = set()

        for token in sent:
            sent_deps.append({
                'token': token.text,
                'pos': token.pos_,
                'dep': token.dep_,
                'head': token.head.text,
                'head_pos': token.head.pos_
            })

            if token.pos_ != "VERB":
                continue

            nouns = []
            advs = []
            for child in token.children:
                if child.pos_ in noun_tags:
                    nouns.append(child.text)
                    verb_noun_positions.add(child.i)
                elif child.pos_ == "ADV":
                    advs.append(child.text)

            verb_relations.append({
                'verb': token.text,
                'nouns': nouns,
                'advs': advs
            })

        independent_nouns = []
        for token in sent:
            if token.pos_ not in noun_tags or token.i in verb_noun_positions:
                continue

            noun_info = {
                'noun': token.text,
                'dep': token.dep_,
                'head': token.head.text,
                'head_pos': token.head.pos_
            }

            # For a noun hanging off a preposition, also record what the
            # preposition itself attaches to.
            if token.head.pos_ == "ADP":
                noun_info['prep_head'] = token.head.head.text
                noun_info['prep_head_pos'] = token.head.head.pos_
                noun_info['prep_dep'] = token.head.dep_

            independent_nouns.append(noun_info)

        sentences_deps.append({
            'sentence': sent.text,
            'dependencies': sent_deps,
            'verb_relations': verb_relations,
            'independent_nouns': independent_nouns
        })

    return sentences_deps, doc
182
def lemmatize_rake_keyphrases_from_doc(rake_keywords, doc):
    """Attach a lemmatized form to each RAKE keyphrase by locating it in ``doc``.

    Each keyphrase is lowercased and split on whitespace, then searched for as
    a contiguous run among the non-punctuation tokens of ``doc`` (token text
    compared case-insensitively). The first run found supplies the lemmas.

    Args:
        rake_keywords: iterable of ``(score, keyphrase)`` pairs.
        doc: iterable of tokens exposing ``is_punct``, ``text`` and ``lemma_``
            (an already-parsed spaCy doc in this script).

    Returns:
        List of ``(score, keyphrase, lemmatized_phrase)`` tuples in input
        order. When no run matches, ``lemmatized_phrase`` is the lowercased
        keyphrase.
    """
    # These lists do not depend on the keyphrase, so build them once rather
    # than once per keyphrase.
    doc_tokens = [t for t in doc if not t.is_punct]
    doc_words = [t.text.lower() for t in doc_tokens]

    lemmatized = []
    for score, keyphrase in rake_keywords:
        keyphrase_lower = keyphrase.lower()
        keyphrase_tokens = keyphrase_lower.split()
        width = len(keyphrase_tokens)

        # Fallback used when the phrase is empty or cannot be found.
        lemmatized_phrase = keyphrase_lower
        if width:
            for start in range(len(doc_tokens) - width + 1):
                stop = start + width
                if doc_words[start:stop] == keyphrase_tokens:
                    lemmatized_phrase = " ".join(
                        t.lemma_ for t in doc_tokens[start:stop]
                    )
                    break

        lemmatized.append((score, keyphrase, lemmatized_phrase))

    return lemmatized
225
def lemmatize_yake_keyphrases_from_doc(yake_keywords, doc):
    """Attach a lemmatized form to each YAKE keyphrase by locating it in ``doc``.

    Each keyphrase is lowercased and split on whitespace, then searched for as
    a contiguous run among the non-punctuation tokens of ``doc`` (token text
    compared case-insensitively). The first run found supplies the lemmas.

    Args:
        yake_keywords: iterable of ``(score, keyphrase)`` pairs.
        doc: iterable of tokens exposing ``is_punct``, ``text`` and ``lemma_``
            (an already-parsed spaCy doc in this script).

    Returns:
        List of ``(score, keyphrase, lemmatized_phrase)`` tuples in input
        order. When no run matches, ``lemmatized_phrase`` is the lowercased
        keyphrase.
    """
    # These lists do not depend on the keyphrase, so build them once rather
    # than once per keyphrase.
    doc_tokens = [t for t in doc if not t.is_punct]
    doc_words = [t.text.lower() for t in doc_tokens]

    lemmatized = []
    for score, keyphrase in yake_keywords:
        keyphrase_lower = keyphrase.lower()
        keyphrase_tokens = keyphrase_lower.split()
        width = len(keyphrase_tokens)

        # Fallback used when the phrase is empty or cannot be found.
        lemmatized_phrase = keyphrase_lower
        if width:
            for start in range(len(doc_tokens) - width + 1):
                stop = start + width
                if doc_words[start:stop] == keyphrase_tokens:
                    lemmatized_phrase = " ".join(
                        t.lemma_ for t in doc_tokens[start:stop]
                    )
                    break

        lemmatized.append((score, keyphrase, lemmatized_phrase))

    return lemmatized
268
def analyze_rake_phrase_relationships(rake_keywords, doc):
    """Find noun/verb phrases of ``doc`` that contain two or more RAKE keyphrases.

    Noun phrases are the doc's noun chunks. Verb phrases are built from each
    VERB token plus its direct children with a selected dependency label;
    verbs without such children produce no verb phrase here. A keyphrase
    counts as contained when its lowercased text is a substring of the
    lowercased phrase.

    Args:
        rake_keywords: iterable of ``(score, keyword)`` pairs.
        doc: parsed document exposing ``noun_chunks`` and iteration over
            tokens with ``pos_``, ``text`` and ``children``.

    Returns:
        List of dicts with ``type`` ('noun phrase' or 'verb phrase'),
        ``phrase`` (original casing) and ``keyphrases`` (the distinct
        lowercased keyphrases found, in keyword order). Noun-phrase entries
        come first.
    """
    # De-duplicate while keeping order, and drop blank entries: a keyphrase
    # listed twice must not satisfy the "two or more" test on its own, and an
    # empty string is a substring of every phrase.
    rake_phrases = list(dict.fromkeys(
        keyword.lower() for _score, keyword in rake_keywords if keyword.strip()
    ))

    selected_deps = ("dobj", "attr", "prep", "pobj", "advmod", "aux", "auxpass", "neg")

    noun_phrases = [chunk.text for chunk in doc.noun_chunks]

    verb_phrases = []
    for token in doc:
        if token.pos_ != "VERB":
            continue
        parts = [token.text]
        parts.extend(child.text for child in token.children if child.dep_ in selected_deps)
        if len(parts) > 1:
            verb_phrases.append(" ".join(parts))

    relationships = []
    for kind, phrases in (('noun phrase', noun_phrases), ('verb phrase', verb_phrases)):
        for phrase in phrases:
            lowered = phrase.lower()
            matching_keyphrases = [kp for kp in rake_phrases if kp in lowered]
            if len(matching_keyphrases) >= 2:
                relationships.append({
                    'type': kind,
                    'phrase': phrase,
                    'keyphrases': matching_keyphrases
                })

    return relationships
311
def analyze_yake_phrase_relationships(yake_keywords, doc):
    """Find noun/verb phrases of ``doc`` that contain two or more YAKE keyphrases.

    Noun phrases are the doc's noun chunks. Verb phrases are built from each
    VERB token plus its direct children with a selected dependency label;
    verbs without such children produce no verb phrase here. A keyphrase
    counts as contained when its lowercased text is a substring of the
    lowercased phrase.

    Args:
        yake_keywords: iterable of ``(score, keyword)`` pairs.
        doc: parsed document exposing ``noun_chunks`` and iteration over
            tokens with ``pos_``, ``text`` and ``children``.

    Returns:
        List of dicts with ``type`` ('noun phrase' or 'verb phrase'),
        ``phrase`` (original casing) and ``keyphrases`` (the distinct
        lowercased keyphrases found, in keyword order). Noun-phrase entries
        come first.
    """
    # De-duplicate while keeping order, and drop blank entries: a keyphrase
    # listed twice must not satisfy the "two or more" test on its own, and an
    # empty string is a substring of every phrase.
    yake_phrases = list(dict.fromkeys(
        keyword.lower() for _score, keyword in yake_keywords if keyword.strip()
    ))

    selected_deps = ("dobj", "attr", "prep", "pobj", "advmod", "aux", "auxpass", "neg")

    noun_phrases = [chunk.text for chunk in doc.noun_chunks]

    verb_phrases = []
    for token in doc:
        if token.pos_ != "VERB":
            continue
        parts = [token.text]
        parts.extend(child.text for child in token.children if child.dep_ in selected_deps)
        if len(parts) > 1:
            verb_phrases.append(" ".join(parts))

    relationships = []
    for kind, phrases in (('noun phrase', noun_phrases), ('verb phrase', verb_phrases)):
        for phrase in phrases:
            lowered = phrase.lower()
            matching_keyphrases = [kp for kp in yake_phrases if kp in lowered]
            if len(matching_keyphrases) >= 2:
                relationships.append({
                    'type': kind,
                    'phrase': phrase,
                    'keyphrases': matching_keyphrases
                })

    return relationships
354
def extract_keyword_relations(doc, message):
    """Collect every NOUN, PROPN and VERB per sentence with up to two related words.

    Related-word candidates carry a priority (3 highest, 1 lowest):

    * VERB keywords: subjects and objects (3); adverbs, objects of attached
      prepositions and indirect objects (2).
    * NOUN/PROPN keywords: adjectives and compounds (3); objects of attached
      prepositions, possessives and nominal modifiers (2); the governing verb
      or the head of a governing preposition (1).

    Candidates are ordered by priority, ties keeping discovery order, and the
    first two are kept.

    Args:
        doc: parsed document exposing ``sents``; each sentence is iterable
            over tokens and has ``text``.
        message: unused; kept so existing callers keep working.

    Returns:
        List of ``{'sentence': str, 'keywords': [...]}`` dicts, one per
        sentence that has at least one keyword. Each keyword dict has
        ``keyword``, ``pos``, ``lemma`` and ``relations`` (dicts with
        ``word``, ``lemma``, ``relation``, ``type``, ``pos``).
    """
    keyword_tags = ('NOUN', 'PROPN', 'VERB')

    # dependency label -> (priority, relation type)
    verb_child_roles = {
        'nsubj': (3, 'subject'),
        'nsubjpass': (3, 'subject'),
        'dobj': (3, 'object'),
        'attr': (3, 'object'),
        'advmod': (2, 'adverb'),
        'iobj': (2, 'indirect_obj'),
        'dative': (2, 'indirect_obj'),
    }
    noun_child_roles = {
        'amod': (3, 'adjective'),
        'compound': (3, 'compound'),
        'poss': (2, 'modifier'),
        'nmod': (2, 'modifier'),
    }

    def candidates_from_children(token, roles):
        """Walk the token's children once, in order, producing candidate tuples."""
        found = []
        for child in token.children:
            label = child.dep_
            if label in roles:
                priority, rel_type = roles[label]
                found.append((priority, child, label, rel_type))
            elif label == 'prep':
                # A preposition contributes its object(s), not itself.
                for obj in child.children:
                    if obj.dep_ == 'pobj':
                        found.append((2, obj, f'{child.text}_{obj.dep_}', 'prep_obj'))
        return found

    sentence_keywords = []
    for sent in doc.sents:
        keywords_data = []

        for token in sent:
            if token.pos_ not in keyword_tags:
                continue

            if token.pos_ == 'VERB':
                candidates = candidates_from_children(token, verb_child_roles)
            else:
                candidates = candidates_from_children(token, noun_child_roles)

                # What this noun itself hangs off: a verb, or a preposition
                # (in which case the preposition's own head is reported).
                governor = token.head
                if governor.pos_ == 'VERB':
                    if token.dep_ in ('nsubj', 'nsubjpass', 'dobj', 'attr'):
                        candidates.append((1, governor, token.dep_, 'verb_relation'))
                elif governor.pos_ == 'ADP':
                    if governor.head.pos_ in keyword_tags:
                        candidates.append(
                            (1, governor.head, f'via_{governor.text}', 'prep_head')
                        )

            # sorted() is stable, so equal priorities keep discovery order.
            best_two = sorted(candidates, key=lambda cand: cand[0], reverse=True)[:2]

            keywords_data.append({
                'keyword': token.text,
                'pos': token.pos_,
                'lemma': token.lemma_,
                'relations': [
                    {
                        'word': related.text,
                        'lemma': related.lemma_,
                        'relation': relation,
                        'type': rel_type,
                        'pos': related.pos_
                    }
                    for _priority, related, relation, rel_type in best_two
                ]
            })

        if keywords_data:
            sentence_keywords.append({
                'sentence': sent.text,
                'keywords': keywords_data
            })

    return sentence_keywords
439
def filter_keywords_by_stopwords(keyword_relations):
    """
    Filter out common/generic keywords using an expanded stop word list.
    Always keeps proper nouns (PROPN). VERB and NOUN keywords are kept unless
    their lowercased lemma is in the matching stop list; keywords with any
    other POS tag are dropped. Sentences left without keywords are omitted.

    Args:
        keyword_relations: List of sentence dicts with 'sentence' and
            'keywords'; each keyword dict provides 'pos' and 'lemma'.

    Returns:
        Filtered keyword_relations without generic keywords. Kept keyword
        dicts are the same objects as in the input (not copies).
    """
    # Common verbs to exclude
    stop_verbs = {
        'be', 'have', 'do', 'say', 'get', 'make', 'go', 'know', 'take', 'see',
        'come', 'think', 'look', 'want', 'give', 'use', 'find', 'tell', 'ask',
        'work', 'seem', 'feel', 'try', 'leave', 'call', 'need', 'become', 'show',
        'mean', 'keep', 'let', 'begin', 'help', 'talk', 'turn', 'start', 'run',
        'move', 'like', 'live', 'believe', 'hold', 'bring', 'happen', 'write',
        'provide', 'sit', 'stand', 'lose', 'pay', 'meet', 'include', 'continue',
        'set', 'learn', 'change', 'lead', 'understand', 'watch', 'follow', 'stop',
        'create', 'speak', 'read', 'allow', 'add', 'spend', 'grow', 'open', 'walk',
        'win', 'offer', 'remember', 'consider', 'appear', 'buy', 'wait', 'serve',
        'die', 'send', 'expect', 'build', 'stay', 'fall', 'cut', 'reach', 'kill',
        'remain', 'suggest', 'raise', 'pass', 'sell', 'require', 'report', 'decide'
    }

    # Common nouns to exclude
    stop_nouns = {
        'thing', 'time', 'year', 'way', 'day', 'man', 'people', 'person', 'woman',
        'life', 'child', 'world', 'hand', 'part', 'place', 'case', 'week', 'company',
        'system', 'program', 'question', 'work', 'number', 'night', 'point', 'home',
        'water', 'room', 'mother', 'area', 'money', 'story', 'fact', 'month', 'lot',
        'right', 'study', 'book', 'eye', 'job', 'word', 'issue', 'side', 'kind',
        'head', 'house', 'service', 'friend', 'father', 'power', 'hour', 'game',
        'line', 'end', 'member', 'law', 'car', 'city', 'community', 'name', 'president',
        'team', 'minute', 'idea', 'body', 'information', 'back', 'parent', 'face',
        'others', 'level', 'office', 'door', 'health', 'art', 'war', 'history',
        'party', 'result', 'change', 'morning', 'reason', 'research', 'girl', 'guy',
        'moment', 'air', 'teacher', 'force', 'education'
    }

    def is_informative(kw):
        """Return True when the keyword survives the stop-word filter."""
        pos = kw['pos']
        if pos == 'PROPN':
            return True
        if pos == 'VERB':
            return kw['lemma'].lower() not in stop_verbs
        if pos == 'NOUN':
            return kw['lemma'].lower() not in stop_nouns
        return False

    filtered_relations = []
    for sent_info in keyword_relations:
        filtered_keywords = [kw for kw in sent_info['keywords'] if is_informative(kw)]
        if filtered_keywords:
            filtered_relations.append({
                'sentence': sent_info['sentence'],
                'keywords': filtered_keywords
            })

    return filtered_relations
506
def extract_sentence_word_lists(filtered_keyword_relations):
    """
    Extract flat lists of words per sentence from filtered keywords.
    Returns lemmatized keywords + their related words, filtered to remove pronouns and stop words.

    Rules, applied to both keywords and their related words:
      * anything tagged PRON, or whose lowercased lemma is a listed pronoun,
        is skipped (a skipped keyword also skips its related words);
      * PROPN is always kept;
      * VERB / NOUN are kept unless their lowercased lemma is a stop word.
    Related words tagged ADJ or ADV are kept as well; keywords with those
    tags are not.

    Args:
        filtered_keyword_relations: Filtered list of sentence dicts with keywords

    Returns:
        List of dicts with 'sentence' and 'words' (sorted list of unique
        lemmas, original casing), one per input sentence.
    """
    # Common verbs to exclude
    stop_verbs = {
        'be', 'have', 'do', 'say', 'get', 'make', 'go', 'know', 'take', 'see',
        'come', 'think', 'look', 'want', 'give', 'use', 'find', 'tell', 'ask',
        'work', 'seem', 'feel', 'try', 'leave', 'call', 'need', 'become', 'show',
        'mean', 'keep', 'let', 'begin', 'help', 'talk', 'turn', 'start', 'run',
        'move', 'like', 'live', 'believe', 'hold', 'bring', 'happen', 'write',
        'provide', 'sit', 'stand', 'lose', 'pay', 'meet', 'include', 'continue',
        'set', 'learn', 'change', 'lead', 'understand', 'watch', 'follow', 'stop',
        'create', 'speak', 'read', 'allow', 'add', 'spend', 'grow', 'open', 'walk',
        'win', 'offer', 'remember', 'consider', 'appear', 'buy', 'wait', 'serve',
        'die', 'send', 'expect', 'build', 'stay', 'fall', 'cut', 'reach', 'kill',
        'remain', 'suggest', 'raise', 'pass', 'sell', 'require', 'report', 'decide'
    }

    # Common nouns to exclude
    stop_nouns = {
        'thing', 'time', 'year', 'way', 'day', 'man', 'people', 'person', 'woman',
        'life', 'child', 'world', 'hand', 'part', 'place', 'case', 'week', 'company',
        'system', 'program', 'question', 'work', 'number', 'night', 'point', 'home',
        'water', 'room', 'mother', 'area', 'money', 'story', 'fact', 'month', 'lot',
        'right', 'study', 'book', 'eye', 'job', 'word', 'issue', 'side', 'kind',
        'head', 'house', 'service', 'friend', 'father', 'power', 'hour', 'game',
        'line', 'end', 'member', 'law', 'car', 'city', 'community', 'name', 'president',
        'team', 'minute', 'idea', 'body', 'information', 'back', 'parent', 'face',
        'others', 'level', 'office', 'door', 'health', 'art', 'war', 'history',
        'party', 'result', 'change', 'morning', 'reason', 'research', 'girl', 'guy',
        'moment', 'air', 'teacher', 'force', 'education'
    }

    # Pronouns to exclude (lemmatized forms)
    pronouns = {
        'i', 'you', 'he', 'she', 'it', 'we', 'they',
        'me', 'him', 'her', 'us', 'them',
        'my', 'your', 'his', 'its', 'our', 'their',
        'mine', 'yours', 'hers', 'ours', 'theirs',
        'myself', 'yourself', 'himself', 'herself', 'itself', 'ourselves', 'themselves',
        'this', 'that', 'these', 'those',
        'who', 'whom', 'whose', 'which', 'what',
        'anybody', 'anyone', 'anything', 'everybody', 'everyone', 'everything',
        'nobody', 'nothing', 'somebody', 'someone', 'something'
    }

    def is_pronoun(entry):
        """True for PRON-tagged entries or entries whose lemma is a listed pronoun."""
        return entry['pos'] == 'PRON' or entry['lemma'].lower() in pronouns

    def passes_stop_lists(entry, allow_modifiers):
        """Apply the PROPN / VERB / NOUN (and optionally ADJ / ADV) rules."""
        pos = entry['pos']
        if pos == 'PROPN':
            return True
        if pos == 'VERB':
            return entry['lemma'].lower() not in stop_verbs
        if pos == 'NOUN':
            return entry['lemma'].lower() not in stop_nouns
        return allow_modifiers and pos in ('ADJ', 'ADV')

    sentence_lists = []

    for sent_info in filtered_keyword_relations:
        words = set()

        for kw in sent_info['keywords']:
            # A pronoun keyword is skipped together with its related words.
            if is_pronoun(kw):
                continue

            if passes_stop_lists(kw, allow_modifiers=False):
                words.add(kw['lemma'])

            for rel in kw['relations']:
                if not is_pronoun(rel) and passes_stop_lists(rel, allow_modifiers=True):
                    words.add(rel['lemma'])

        sentence_lists.append({
            'sentence': sent_info['sentence'],
            'words': sorted(words)  # Sort for consistent output
        })

    return sentence_lists
608
def is_common_phrase(phrase):
    """Return True when ``phrase`` consists only of common words.

    The phrase is parsed with the module-level spaCy pipeline. Punctuation is
    ignored. The phrase is rejected (False) as soon as a token is a proper
    noun, or is an alphabetic non-stop word whose ``rank`` is above 10000 or
    equal to 0 (treated here as rare or missing rank information).
    """
    for token in nlp(phrase):
        if token.is_punct:
            continue

        if token.pos_ == "PROPN":
            return False

        content_word = token.is_alpha and not token.is_stop
        if content_word and (token.rank == 0 or token.rank > 10000):
            return False

    return True
626
def print_keywords(keywords, time_taken, file, extractor_name="RAKE", lemmatized_data=None):
    """Write one extractor's keywords, scores and timing to ``file``.

    Args:
        keywords: sequence of ``(score, word)`` pairs; when empty, a single
            "No ... keywords extracted" line is written instead.
        time_taken: extraction time in seconds, shown in the total line.
        file: writable text stream.
        extractor_name: label used in the header and the empty message.
        lemmatized_data: optional sequence of ``(score, original, lemmatized)``
            triples. When given and non-empty it is listed instead of
            ``keywords``, with the lemma appended whenever it differs from
            the lowercased original.
    """
    if not keywords:
        file.write(f"No {extractor_name} keywords extracted\n")
        return

    file.write(f"{extractor_name} Keywords:\n")

    if lemmatized_data:
        for score, original, lemmatized in lemmatized_data:
            # Show the lemma only when lemmatization actually changed something.
            if lemmatized != original.lower():
                lemma_suffix = f" → {lemmatized}"
            else:
                lemma_suffix = ""
            file.write(f" - {original} (score: {score:.2f}){lemma_suffix}\n")
    else:
        for score, word in keywords:
            file.write(f" - {word} (score: {score:.2f})\n")

    # The total always counts ``keywords``, also when lemmatized data is listed.
    file.write(f"Total keywords: {len(keywords)} extracted in {time_taken:.4f} seconds\n")
645
def print_named_entities(entities, file):
    """Write the named entities to ``file``.

    Args:
        entities: sequence of ``(entity_text, entity_type)`` pairs; when
            empty, a single "No named entities extracted" line is written.
        file: writable text stream.
    """
    if not entities:
        file.write("\nNo named entities extracted\n")
        return

    file.write("\nNamed Entities:\n")
    for text, label in entities:
        file.write(f" - {text} ({label})\n")
    file.write(f"Total named entities: {len(entities)}\n")
655
def print_phrases(noun_phrases, verb_phrases, file):
    """Write the noun-phrase section followed by the verb-phrase section.

    Each section has a header, then either one bullet per phrase plus a total
    line, or a "No ... phrases extracted" line when the list is empty.

    Args:
        noun_phrases: sequence of noun phrase strings.
        verb_phrases: sequence of verb phrase strings.
        file: writable text stream.
    """
    def write_bullets(phrases):
        for phrase in phrases:
            file.write(f" - {phrase}\n")

    file.write("\nNoun Phrases:\n")
    if not noun_phrases:
        file.write(" No noun phrases extracted\n")
    else:
        write_bullets(noun_phrases)
        file.write(f"Total noun phrases: {len(noun_phrases)}\n")

    file.write("\nVerb Phrases:\n")
    if not verb_phrases:
        file.write(" No verb phrases extracted\n")
    else:
        write_bullets(verb_phrases)
        file.write(f"Total verb phrases: {len(verb_phrases)}\n")
673
def print_dependencies(sentences_deps, time_taken, file):
    """Write the per-sentence dependency report to ``file``.

    Args:
        sentences_deps: list of sentence dicts with 'sentence' and
            'dependencies', and optionally 'verb_relations' and
            'independent_nouns' (the layout produced by
            ``extract_dependencies``). When empty, only the header and a
            "No dependencies extracted" line are written.
        time_taken: extraction time in seconds, shown in the header.
        file: writable text stream.
    """
    file.write(f"\nDependency Relations (extracted in {time_taken:.4f}sec):\n")

    if not sentences_deps:
        file.write(" No dependencies extracted\n")
        return

    for number, sent_info in enumerate(sentences_deps, 1):
        file.write(f"\n Sentence {number}: {sent_info['sentence']}\n")

        # Every token with its head.
        file.write(" Dependencies:\n")
        for dep in sent_info['dependencies']:
            file.write(f" {dep['token']} ({dep['pos']}) --[{dep['dep']}]--> {dep['head']} ({dep['head_pos']})\n")

        # Verbs with their noun and adverb children; section omitted when empty.
        verb_relations = sent_info.get('verb_relations')
        if verb_relations:
            file.write(" Verb Relations:\n")
            for vr in verb_relations:
                if vr['nouns']:
                    nouns_str = ', '.join(vr['nouns'])
                else:
                    nouns_str = 'none'
                if vr['advs']:
                    advs_str = ', '.join(vr['advs'])
                else:
                    advs_str = 'none'
                file.write(f" VERB: {vr['verb']} | NOUNs: {nouns_str} | ADVs: {advs_str}\n")

        # Nouns not attached to a verb; section omitted when empty.
        independent_nouns = sent_info.get('independent_nouns')
        if independent_nouns:
            file.write(" Independent NOUNs (not verb-dependent):\n")
            for noun_info in independent_nouns:
                file.write(f" {noun_info['noun']} --[{noun_info['dep']}]--> {noun_info['head']} ({noun_info['head_pos']})")
                # Nouns under a preposition get a second hop: the preposition's head.
                if 'prep_head' in noun_info:
                    file.write(f" --[{noun_info['prep_dep']}]--> {noun_info['prep_head']} ({noun_info['prep_head_pos']})")
                file.write("\n")

    file.write(f"\nTotal sentences: {len(sentences_deps)}\n")
707
def print_keyword_relationships(keyword_relations, file, header="Keyword Relationships (All NOUNs and VERBs)"):
    """Write each sentence's keywords with their related words to ``file``.

    Args:
        keyword_relations: list of sentence dicts with 'sentence' and
            'keywords'; each keyword provides 'lemma', 'pos' and 'relations'
            (dicts with 'lemma' and 'type'). When empty, only the header and
            a "No keywords found" line are written.
        file: writable text stream.
        header: section title written before the listing.
    """
    file.write(f"\n{header}:\n")

    if not keyword_relations:
        file.write(" No keywords found\n")
        return

    for sent_info in keyword_relations:
        file.write(f"\n Sentence: {sent_info['sentence']}\n")
        file.write(f" Keywords with relations:\n")

        for kw in sent_info['keywords']:
            # Every rendered relation contains "(", so an empty join can only
            # mean there were no relations at all.
            related_str = ', '.join(
                f"{rel['lemma']}({rel['type']})" for rel in kw['relations']
            ) or 'none'
            file.write(f" {kw['lemma']} [{kw['pos']}]: {related_str}\n")
727
# ---------------------------------------------------------------------------
# Main script: run every extractor over each message, write a per-message
# report plus an overall summary to the output file, then print a short
# summary to stdout.
# ---------------------------------------------------------------------------
count = 0
rake_total_time = 0
yake_total_time = 0
keybert_total_time = 0
dep_total_time = 0
total_keyword_density = 0.0
all_keywords = set()  # Track unique keywords across all messages
total_words = 0  # Track total words across all messages


def _timed(func, *args):
    """Call ``func(*args)`` and return ``(result, elapsed_seconds)``."""
    started = time.time()
    result = func(*args)
    return result, time.time() - started


def _per_message(total):
    """Return ``total / count``, or 0.0 when no message was processed.

    Guards the summary against ZeroDivisionError on an empty dataset.
    """
    return total / count if count else 0.0


with open(output_file, 'w', encoding='utf-8') as f:
    for item in rawData:
        # Get message text
        message = item['speaker'] + ": " + item['content']

        # Time each extractor separately and accumulate the totals.
        rake_keywords, rake_time = _timed(extract_keywords_rake, message)
        rake_total_time += rake_time

        yake_keywords, yake_time = _timed(extract_keywords_yake, message)
        yake_total_time += yake_time

        keybert_keywords, keybert_time = _timed(extract_keywords_keybert, message)
        keybert_total_time += keybert_time

        # Dependency extraction also returns the spaCy doc reused below.
        (dependencies, spacy_doc), dep_time = _timed(extract_dependencies, message)
        dep_total_time += dep_time

        # Lemmatize RAKE / YAKE keyphrases using the already-computed spacy doc
        rake_lemmatized = lemmatize_rake_keyphrases_from_doc(rake_keywords, spacy_doc)
        yake_lemmatized = lemmatize_yake_keyphrases_from_doc(yake_keywords, spacy_doc)

        # Analyze keyphrase relationships using the spacy doc.
        # NOTE(review): these two results are not written anywhere in this script.
        rake_relationships = analyze_rake_phrase_relationships(rake_keywords, spacy_doc)
        yake_relationships = analyze_yake_phrase_relationships(yake_keywords, spacy_doc)

        # Find keyword relationships for indexing, then drop stop words
        keyword_relationships = extract_keyword_relations(spacy_doc, message)
        filtered_keyword_relationships = filter_keywords_by_stopwords(keyword_relationships)

        # Extract named entities
        # entities = extract_named_entities(message)

        # Extract noun and verb phrases
        # noun_phrases, verb_phrases = extract_phrases(message)

        # Write original message
        f.write(f"\n{'='*80}\n")
        f.write(f"Message {count + 1}:\n")
        f.write(message + "\n")
        f.write('-' * 80 + "\n")

        # NOTE(review): the extent of this verbose-only section was inferred
        # from the surrounding comments (the timing section below has its own
        # "only if verbose" guard) -- confirm against the intended layout.
        if verbose:
            # Write RAKE keywords with lemmatization
            print_keywords(rake_keywords, rake_time, f, "RAKE", lemmatized_data=rake_lemmatized)

            # Write YAKE keywords with lemmatization
            f.write("\n")
            print_keywords(yake_keywords, yake_time, f, "YAKE", lemmatized_data=yake_lemmatized)

            # Write KeyBERT keywords
            f.write("\n")
            print_keywords(keybert_keywords, keybert_time, f, "KeyBERT")

            # Write dependencies
            print_dependencies(dependencies, dep_time, f)

            # Write keyword relationships for indexing (unfiltered)
            print_keyword_relationships(keyword_relationships, f)

            # Write filtered keyword relationships for indexing
            print_keyword_relationships(filtered_keyword_relationships, f, header="Filtered Keywords (Stop Words Removed)")

        # Extract and print flat word lists per sentence
        sentence_word_lists = extract_sentence_word_lists(filtered_keyword_relationships)
        f.write(f"\nSentence Word Lists (Keywords + Relations):\n")
        for sent_list in sentence_word_lists:
            f.write(f" {sent_list['words']}\n")

        # Unique words across all sentence lists of this message
        all_keywords_this_message = set()
        for sent_list in sentence_word_lists:
            all_keywords_this_message.update(sent_list['words'])

        # Update global tracking
        all_keywords.update(all_keywords_this_message)

        # Count total words in original message (whitespace-separated)
        message_word_count = len(message.split())
        total_words += message_word_count
        unique_keyword_count = len(all_keywords_this_message)

        f.write(f"\nWord Statistics:\n")
        f.write(f" Total words in message: {message_word_count}\n")
        f.write(f" Unique filtered keywords: {unique_keyword_count}\n")
        if message_word_count > 0:
            percentage = (unique_keyword_count / message_word_count) * 100
            f.write(f" Keyword density: {percentage:.1f}%\n")
            total_keyword_density += percentage

        # Write timing comparison (only if verbose)
        if verbose:
            f.write(f"\nTiming Comparison:\n")
            f.write(f" RAKE: {rake_time:.4f}sec\n")
            f.write(f" YAKE: {yake_time:.4f}sec\n")
            f.write(f" KeyBERT: {keybert_time:.4f}sec\n")
            f.write(f" Dependencies: {dep_time:.4f}sec\n")
            times = {'RAKE': rake_time, 'YAKE': yake_time, 'KeyBERT': keybert_time, 'Dependencies': dep_time}
            fastest = min(times, key=times.get)
            f.write(f" Fastest: {fastest}\n")

        # Write named entities
        # print_named_entities(entities, f)

        # Write phrases
        # print_phrases(noun_phrases, verb_phrases, f)

        count += 1

        # Print progress indicator every 50 messages
        if count % 50 == 0:
            print(f"Progress: Processed {count} messages...")

    # Write overall timing summary
    f.write(f"\n\n{'='*80}\n")
    f.write(f"OVERALL TIMING SUMMARY ({count} messages):\n")
    f.write(f"{'='*80}\n")
    f.write(f"Total RAKE time: {rake_total_time:.4f}sec (avg: {_per_message(rake_total_time):.4f}sec per message)\n")
    f.write(f"Total YAKE time: {yake_total_time:.4f}sec (avg: {_per_message(yake_total_time):.4f}sec per message)\n")
    f.write(f"Total KeyBERT time: {keybert_total_time:.4f}sec (avg: {_per_message(keybert_total_time):.4f}sec per message)\n")
    f.write(f"Total Dependency time: {dep_total_time:.4f}sec (avg: {_per_message(dep_total_time):.4f}sec per message)\n")

    avg_keyword_density = _per_message(total_keyword_density)
    f.write(f"\nAverage keyword density: {avg_keyword_density:.1f}%\n")
    f.write(f"\nTotal words across all messages: {total_words}\n")
    f.write(f"Unique keywords across all messages: {len(all_keywords)}\n")
    if total_words > 0:
        overall_density = (len(all_keywords) / total_words) * 100
        f.write(f"Overall keyword density: {overall_density:.1f}%\n")

    total_times = {'RAKE': rake_total_time, 'YAKE': yake_total_time, 'KeyBERT': keybert_total_time, 'Dependencies': dep_total_time}
    fastest = min(total_times, key=total_times.get)
    slowest = max(total_times, key=total_times.get)
    f.write(f"\nOverall fastest: {fastest}\n")
    f.write(f"Overall slowest: {slowest}\n")
    # The ratio is undefined when the fastest total is zero (e.g. no messages).
    if total_times[fastest] > 0:
        speedup = total_times[slowest] / total_times[fastest]
        f.write(f"Speedup factor (fastest vs slowest): {speedup:.2f}x\n")
    else:
        f.write("Speedup factor (fastest vs slowest): n/a\n")

print(f"Extraction complete. Results written to {output_file}")
print(f"Processed {count} messages")
print(f"RAKE total time: {rake_total_time:.4f}sec (avg: {_per_message(rake_total_time):.4f}sec)")
print(f"YAKE total time: {yake_total_time:.4f}sec (avg: {_per_message(yake_total_time):.4f}sec)")
print(f"KeyBERT total time: {keybert_total_time:.4f}sec (avg: {_per_message(keybert_total_time):.4f}sec)")
print(f"Dependency time: {dep_total_time:.4f}sec (avg: {_per_message(dep_total_time):.4f}sec)")
total_times = {'RAKE': rake_total_time, 'YAKE': yake_total_time, 'KeyBERT': keybert_total_time, 'Dependencies': dep_total_time}
fastest = min(total_times, key=total_times.get)
print(f"Overall fastest: {fastest}")
906