microsoft/TypeAgent

Public

mirrored fromhttps://github.com/microsoft/TypeAgentAvailable

CodeCommitsIssuesPull requestsActionsInsightsSecurity
add-terminal-ui-features-v2

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

python/fineTuning/unsloth/nltkExtract.py

918lines · modecode

1# Copyright (c) Microsoft Corporation and Henry Lucco.
2# Licensed under the MIT License.
3
4from rake_nltk import Rake
5import sys
6import os
7from argparse import ArgumentParser
8import nltk
9from nltk import word_tokenize, pos_tag, ne_chunk
10from nltk.tree import Tree
11import spacy
12import yake
13from keybert import KeyBERT
14
# Command-line interface. Every option has a default, so the script can be
# launched without any arguments.
parser = ArgumentParser(description="Extract keywords from dataset using NLTK-RAKE.")
parser.add_argument(
    "--dataset_path",
    type=str,
    default='/data/npr/npr_chunks_no_embedding.json',
    help="Path to the dataset file.",
)
parser.add_argument(
    "--max_length",
    type=int,
    default=1,
    help="Maximum number of words in a keyword phrase.",
)
parser.add_argument(
    "--output_file",
    type=str,
    default='extraction.txt',
    help="Path to the output file.",
)
parser.add_argument(
    "--verbose",
    action='store_true',
    help="Enable verbose output (shows all extraction details). Default is non-verbose.",
)
parser.add_argument(
    "--nogpu",
    action='store_true',
    help="Force KeyBERT to use CPU instead of GPU. Default is false (use GPU if available).",
)
args = parser.parse_args(sys.argv[1:])

# Expose the parsed options as module-level names used by the rest of the script.
dataset_path, max_length, output_file = args.dataset_path, args.max_length, args.output_file
verbose, nogpu = args.verbose, args.nogpu
27
# RAKE: --max_length is the upper bound on words per extracted phrase.
rake = Rake(max_length=max_length)

# YAKE arguments, in order: language, maximum n-gram size, deduplication
# threshold, number of keywords to return.
yake_extractor = yake.KeywordExtractor(lan="en", n=max_length, dedupLim=0.9, top=20)

# KeyBERT: default construction unless --nogpu was given, in which case the
# sentence-transformer backing it is pinned to the CPU.
if not nogpu:
    keybert_model = KeyBERT()
else:
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
    keybert_model = KeyBERT(model=model)
    print('Using KeyBERT on CPU')

# spaCy: optionally force CPU execution, then load the small English pipeline
# with only the "ner" component disabled (NER is not used by this script).
if nogpu:
    spacy.require_cpu()
    print('Using spaCy on CPU')
nlp = spacy.load("en_core_web_sm", disable=["ner"])
53
54# Load an array of JSON objects with properties like speaker and content
55import json
56import time
57
# Load the dataset: an array of JSON objects (the processing loop reads the
# 'speaker' and 'content' keys of each one).
# The encoding is given explicitly so decoding does not depend on the platform
# locale; the output report is likewise written as UTF-8.
with open(dataset_path, encoding='utf-8') as f:
    rawData = json.load(f)
60
def extract_keywords_rake(text):
    """Run the module-level RAKE extractor over *text*.

    Returns whatever ``rake.get_ranked_phrases_with_scores()`` yields; callers
    in this file unpack each entry as ``(score, phrase)``. Phrase length was
    already bounded when ``rake`` was constructed with ``max_length``.
    """
    rake.extract_keywords_from_text(text)
    return rake.get_ranked_phrases_with_scores()
69
def extract_keywords_yake(text):
    """Run the module-level YAKE extractor over *text*.

    YAKE yields ``(keyword, score)`` pairs (lower score = better); each pair is
    flipped to ``(score, keyword)`` so the result has the same layout as the
    RAKE output.
    """
    flipped = []
    for keyword, score in yake_extractor.extract_keywords(text):
        flipped.append((score, keyword))
    return flipped
77
def extract_keywords_keybert(text):
    """Run the module-level KeyBERT model over *text*.

    Requests the top 20 keyphrases of 1..``max_length`` words. KeyBERT yields
    ``(keyword, score)`` pairs (higher score = better); each pair is flipped to
    ``(score, keyword)`` to match the RAKE layout.
    """
    ranked = keybert_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, max_length),
        top_n=20,
    )
    return [(score, keyword) for keyword, score in ranked]
86
def extract_named_entities(text):
    """Extract named entities from *text* with NLTK.

    The text is tokenized, POS-tagged and chunked with ``ne_chunk`` (typed
    labels, ``binary=False``). Every subtree of the chunk result becomes one
    ``(entity_text, entity_type)`` pair, where the text is the subtree's leaf
    words joined by single spaces and the type is the subtree label.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    chunk_tree = ne_chunk(tagged_tokens, binary=False)

    # Plain (word, tag) tuples are non-entity tokens; only Tree nodes are entities.
    return [
        (" ".join(word for word, _tag in subtree.leaves()), subtree.label())
        for subtree in chunk_tree
        if isinstance(subtree, Tree)
    ]
105
def extract_phrases(text):
    """Extract noun phrases and verb phrases from *text* using spaCy.

    Returns:
        ``(noun_phrases, verb_phrases)``. Noun phrases are the texts of the
        parsed document's noun chunks. Each VERB token contributes one verb
        phrase: the verb followed by those of its direct children whose
        dependency label is in the attached set below, joined by spaces (a verb
        with no such children contributes just its own text).
    """
    attached_deps = ("dobj", "attr", "prep", "pobj", "advmod", "aux", "auxpass", "neg")
    doc = nlp(text)

    noun_phrases = [chunk.text for chunk in doc.noun_chunks]

    verb_phrases = []
    for token in doc:
        if token.pos_ != "VERB":
            continue
        words = [token.text]
        words.extend(child.text for child in token.children if child.dep_ in attached_deps)
        # Joining a single-element list yields the bare verb text.
        verb_phrases.append(" ".join(words))

    return noun_phrases, verb_phrases
128
def extract_dependencies(text):
    """Extract per-sentence dependency information from *text* using spaCy.

    Returns:
        ``(sentences_deps, doc)`` where ``doc`` is the parsed spaCy document
        (returned so callers can reuse it) and ``sentences_deps`` holds one
        dict per sentence with the keys:

        * ``'sentence'``: the sentence text.
        * ``'dependencies'``: one dict per token with ``token``, ``pos``,
          ``dep``, ``head`` and ``head_pos``.
        * ``'verb_relations'``: one dict per VERB token with the texts of its
          NOUN/PROPN children (``nouns``) and ADV children (``advs``).
        * ``'independent_nouns'``: NOUN/PROPN tokens that are not a direct
          child of any verb in the sentence; when the noun's head is an
          adposition, the adposition's own head is recorded as well.
    """
    doc = nlp(text)
    noun_tags = ("NOUN", "PROPN")

    sentences_deps = []
    for sent in doc.sents:
        sent_deps = []
        verb_relations = []
        # Track verb-attached nouns by token index rather than by surface text,
        # so a repeated word that is NOT attached to a verb is still reported
        # as independent.
        verb_dependent_ids = set()

        for token in sent:
            sent_deps.append({
                'token': token.text,
                'pos': token.pos_,
                'dep': token.dep_,
                'head': token.head.text,
                'head_pos': token.head.pos_
            })

            if token.pos_ != "VERB":
                continue

            nouns = []
            advs = []
            for child in token.children:
                if child.pos_ in noun_tags:
                    nouns.append(child.text)
                    verb_dependent_ids.add(child.i)
                elif child.pos_ == "ADV":
                    advs.append(child.text)

            verb_relations.append({
                'verb': token.text,
                'nouns': nouns,
                'advs': advs
            })

        independent_nouns = []
        for token in sent:
            if token.pos_ not in noun_tags or token.i in verb_dependent_ids:
                continue

            noun_info = {
                'noun': token.text,
                'dep': token.dep_,
                'head': token.head.text,
                'head_pos': token.head.pos_
            }

            # For a noun hanging off a preposition, also record what the
            # preposition itself attaches to.
            if token.head.pos_ == "ADP":
                noun_info['prep_head'] = token.head.head.text
                noun_info['prep_head_pos'] = token.head.head.pos_
                noun_info['prep_dep'] = token.head.dep_

            independent_nouns.append(noun_info)

        sentences_deps.append({
            'sentence': sent.text,
            'dependencies': sent_deps,
            'verb_relations': verb_relations,
            'independent_nouns': independent_nouns
        })

    return sentences_deps, doc
195
def _lemmatize_keyphrases_from_doc(keywords, doc):
    """Shared implementation for the RAKE and YAKE lemmatizers.

    Each keyphrase is lowercased and split on whitespace, then searched for as
    a contiguous, case-insensitive run among the non-punctuation tokens of
    *doc*. The first match supplies the lemmas; when nothing matches (or the
    keyphrase has no words) the lowercased keyphrase is used unchanged.

    Args:
        keywords: Iterable of ``(score, keyphrase)`` pairs.
        doc: Iterable of tokens exposing ``text``, ``lemma_`` and ``is_punct``.

    Returns:
        List of ``(score, keyphrase, lemmatized_phrase)`` tuples in input order.
    """
    # These depend only on the doc, so build them once rather than per keyphrase.
    doc_tokens = [token for token in doc if not token.is_punct]
    doc_words = [token.text.lower() for token in doc_tokens]

    lemmatized = []
    for score, keyphrase in keywords:
        keyphrase_lower = keyphrase.lower()
        wanted = keyphrase_lower.split()
        width = len(wanted)

        matched_tokens = []
        if width:
            for start in range(len(doc_words) - width + 1):
                if doc_words[start:start + width] == wanted:
                    matched_tokens = doc_tokens[start:start + width]
                    break

        if matched_tokens:
            lemmatized_phrase = " ".join(token.lemma_ for token in matched_tokens)
        else:
            lemmatized_phrase = keyphrase_lower

        lemmatized.append((score, keyphrase, lemmatized_phrase))

    return lemmatized


def lemmatize_rake_keyphrases_from_doc(rake_keywords, doc):
    """Lemmatize RAKE keyphrases by locating their tokens in an existing spaCy doc.

    Returns a list of ``(score, keyphrase, lemmatized_phrase)`` tuples; see
    ``_lemmatize_keyphrases_from_doc`` for the matching rules.
    """
    return _lemmatize_keyphrases_from_doc(rake_keywords, doc)


def lemmatize_yake_keyphrases_from_doc(yake_keywords, doc):
    """Lemmatize YAKE keyphrases by locating their tokens in an existing spaCy doc.

    Returns a list of ``(score, keyphrase, lemmatized_phrase)`` tuples; see
    ``_lemmatize_keyphrases_from_doc`` for the matching rules.
    """
    return _lemmatize_keyphrases_from_doc(yake_keywords, doc)
281
# Dependency labels of verb children that are appended to a verb phrase.
_VERB_PHRASE_DEPS = frozenset(
    ("dobj", "attr", "prep", "pobj", "advmod", "aux", "auxpass", "neg")
)


def _analyze_phrase_relationships(keywords, doc):
    """Shared implementation for the RAKE and YAKE relationship analyzers.

    Collects the doc's noun-chunk texts and its multi-word verb phrases (a VERB
    token followed by its children whose dependency label is in
    ``_VERB_PHRASE_DEPS``), then reports every such phrase whose lowercased text
    contains, as substrings, at least two of the lowercased keyphrases.

    Args:
        keywords: Iterable of ``(score, keyword)`` pairs.
        doc: Parsed document exposing ``noun_chunks`` and iterable tokens with
            ``pos_``, ``text`` and ``children``.

    Returns:
        List of dicts with ``'type'`` (``'noun phrase'`` or ``'verb phrase'``),
        ``'phrase'`` and ``'keyphrases'``; noun phrases come first.
    """
    keyphrases = [keyword.lower() for _score, keyword in keywords]

    noun_phrases = [chunk.text for chunk in doc.noun_chunks]

    verb_phrases = []
    for token in doc:
        if token.pos_ != "VERB":
            continue
        words = [token.text]
        words.extend(child.text for child in token.children if child.dep_ in _VERB_PHRASE_DEPS)
        # A bare verb cannot contain two keyphrases' worth of context; skip it.
        if len(words) > 1:
            verb_phrases.append(" ".join(words))

    relationships = []
    for kind, phrases in (("noun phrase", noun_phrases), ("verb phrase", verb_phrases)):
        for phrase in phrases:
            phrase_lower = phrase.lower()
            matching = [kp for kp in keyphrases if kp in phrase_lower]
            if len(matching) >= 2:
                relationships.append({
                    'type': kind,
                    'phrase': phrase,
                    'keyphrases': matching
                })

    return relationships


def analyze_rake_phrase_relationships(rake_keywords, doc):
    """Find noun/verb phrases in *doc* that contain two or more RAKE keyphrases.

    See ``_analyze_phrase_relationships`` for the matching rules and the shape
    of the returned list.
    """
    return _analyze_phrase_relationships(rake_keywords, doc)


def analyze_yake_phrase_relationships(yake_keywords, doc):
    """Find noun/verb phrases in *doc* that contain two or more YAKE keyphrases.

    See ``_analyze_phrase_relationships`` for the matching rules and the shape
    of the returned list.
    """
    return _analyze_phrase_relationships(yake_keywords, doc)
367
def extract_keyword_relations(doc, message):
    """Collect every NOUN, PROPN and VERB as a keyword with up to two related words.

    Related-word candidates carry a priority (3 highest, 1 lowest); the two
    highest-priority candidates are kept, ties resolved by discovery order.

    Args:
        doc: Parsed document exposing ``sents``; tokens need ``text``,
            ``lemma_``, ``pos_``, ``dep_``, ``head`` and ``children``.
        message: Original message text (not used by this function).

    Returns:
        One dict per sentence that has at least one keyword, with
        ``'sentence'`` (sentence text) and ``'keywords'``, a list of dicts
        holding ``keyword``, ``pos``, ``lemma`` and ``relations``.
    """
    # Child dependency label -> (priority, relation type), for verb keywords.
    verb_child_roles = {
        'nsubj': (3, 'subject'),
        'nsubjpass': (3, 'subject'),
        'dobj': (3, 'object'),
        'attr': (3, 'object'),
        'advmod': (2, 'adverb'),
        'iobj': (2, 'indirect_obj'),
        'dative': (2, 'indirect_obj'),
    }
    # Child dependency label -> (priority, relation type), for noun keywords.
    noun_child_roles = {
        'amod': (3, 'adjective'),
        'compound': (3, 'compound'),
        'poss': (2, 'modifier'),
        'nmod': (2, 'modifier'),
    }

    def gather_from_children(token, roles):
        """Return candidate tuples contributed by *token*'s direct children."""
        found = []
        for child in token.children:
            if child.dep_ == 'prep':
                # A preposition contributes its objects, not itself.
                for grandchild in child.children:
                    if grandchild.dep_ == 'pobj':
                        found.append(
                            (2, grandchild, f'{child.text}_{grandchild.dep_}', 'prep_obj')
                        )
            elif child.dep_ in roles:
                rank, role = roles[child.dep_]
                found.append((rank, child, child.dep_, role))
        return found

    sentence_keywords = []
    for sent in doc.sents:
        keywords_data = []

        for token in sent:
            if token.pos_ == 'VERB':
                candidates = gather_from_children(token, verb_child_roles)
            elif token.pos_ in ('NOUN', 'PROPN'):
                candidates = gather_from_children(token, noun_child_roles)

                # Lowest priority: what the noun itself hangs off.
                governor = token.head
                if governor.pos_ == 'VERB':
                    if token.dep_ in ('nsubj', 'nsubjpass', 'dobj', 'attr'):
                        candidates.append((1, governor, token.dep_, 'verb_relation'))
                elif governor.pos_ == 'ADP':
                    if governor.head.pos_ in ('NOUN', 'PROPN', 'VERB'):
                        candidates.append(
                            (1, governor.head, f'via_{governor.text}', 'prep_head')
                        )
            else:
                continue

            # Stable sort: equal priorities stay in discovery order.
            best_two = sorted(candidates, key=lambda entry: entry[0], reverse=True)[:2]
            keywords_data.append({
                'keyword': token.text,
                'pos': token.pos_,
                'lemma': token.lemma_,
                'relations': [
                    {
                        'word': related.text,
                        'lemma': related.lemma_,
                        'relation': relation,
                        'type': role,
                        'pos': related.pos_
                    }
                    for _rank, related, relation, role in best_two
                ]
            })

        if keywords_data:
            sentence_keywords.append({
                'sentence': sent.text,
                'keywords': keywords_data
            })

    return sentence_keywords
452
# Common verb lemmas treated as too generic to be keywords.
_STOP_VERBS = frozenset({
    'be', 'have', 'do', 'say', 'get', 'make', 'go', 'know', 'take', 'see',
    'come', 'think', 'look', 'want', 'give', 'use', 'find', 'tell', 'ask',
    'work', 'seem', 'feel', 'try', 'leave', 'call', 'need', 'become', 'show',
    'mean', 'keep', 'let', 'begin', 'help', 'talk', 'turn', 'start', 'run',
    'move', 'like', 'live', 'believe', 'hold', 'bring', 'happen', 'write',
    'provide', 'sit', 'stand', 'lose', 'pay', 'meet', 'include', 'continue',
    'set', 'learn', 'change', 'lead', 'understand', 'watch', 'follow', 'stop',
    'create', 'speak', 'read', 'allow', 'add', 'spend', 'grow', 'open', 'walk',
    'win', 'offer', 'remember', 'consider', 'appear', 'buy', 'wait', 'serve',
    'die', 'send', 'expect', 'build', 'stay', 'fall', 'cut', 'reach', 'kill',
    'remain', 'suggest', 'raise', 'pass', 'sell', 'require', 'report', 'decide',
})

# Common noun lemmas treated as too generic to be keywords.
_STOP_NOUNS = frozenset({
    'thing', 'time', 'year', 'way', 'day', 'man', 'people', 'person', 'woman',
    'life', 'child', 'world', 'hand', 'part', 'place', 'case', 'week', 'company',
    'system', 'program', 'question', 'work', 'number', 'night', 'point', 'home',
    'water', 'room', 'mother', 'area', 'money', 'story', 'fact', 'month', 'lot',
    'right', 'study', 'book', 'eye', 'job', 'word', 'issue', 'side', 'kind',
    'head', 'house', 'service', 'friend', 'father', 'power', 'hour', 'game',
    'line', 'end', 'member', 'law', 'car', 'city', 'community', 'name', 'president',
    'team', 'minute', 'idea', 'body', 'information', 'back', 'parent', 'face',
    'others', 'level', 'office', 'door', 'health', 'art', 'war', 'history',
    'party', 'result', 'change', 'morning', 'reason', 'research', 'girl', 'guy',
    'moment', 'air', 'teacher', 'force', 'education',
})

# Pronoun forms excluded from sentence word lists.
_PRONOUNS = frozenset({
    'i', 'you', 'he', 'she', 'it', 'we', 'they',
    'me', 'him', 'her', 'us', 'them',
    'my', 'your', 'his', 'its', 'our', 'their',
    'mine', 'yours', 'hers', 'ours', 'theirs',
    'myself', 'yourself', 'himself', 'herself', 'itself', 'ourselves', 'themselves',
    'this', 'that', 'these', 'those',
    'who', 'whom', 'whose', 'which', 'what',
    'anybody', 'anyone', 'anything', 'everybody', 'everyone', 'everything',
    'nobody', 'nothing', 'somebody', 'someone', 'something',
})


def _passes_stop_filter(pos, lemma):
    """Return True for a PROPN, or a VERB/NOUN whose lowercased lemma is not a stop word."""
    if pos == 'PROPN':
        return True
    if pos == 'VERB':
        return lemma.lower() not in _STOP_VERBS
    if pos == 'NOUN':
        return lemma.lower() not in _STOP_NOUNS
    return False


def _is_pronoun(pos, lemma):
    """Return True when the tag is PRON or the lowercased lemma is a listed pronoun."""
    return pos == 'PRON' or lemma.lower() in _PRONOUNS


def filter_keywords_by_stopwords(keyword_relations):
    """
    Filter out common/generic keywords using an expanded stop word list.
    Always keeps proper nouns (PROPN).

    Verbs and nouns are kept only when their lowercased lemma is absent from
    the matching stop list; keywords with any other POS tag are dropped.
    Sentences left with no keywords are omitted from the result.

    Args:
        keyword_relations: List of sentence dicts with 'sentence' and 'keywords'
            (each keyword dict needs 'pos' and 'lemma')

    Returns:
        Filtered keyword_relations without generic keywords; the surviving
        keyword dicts are the same objects as in the input
    """
    filtered_relations = []
    for sent_info in keyword_relations:
        filtered_keywords = [
            kw for kw in sent_info['keywords']
            if _passes_stop_filter(kw['pos'], kw['lemma'])
        ]
        if filtered_keywords:
            filtered_relations.append({
                'sentence': sent_info['sentence'],
                'keywords': filtered_keywords
            })

    return filtered_relations


def extract_sentence_word_lists(filtered_keyword_relations):
    """
    Extract flat lists of words per sentence from filtered keywords.
    Returns lemmatized keywords + their related words, filtered to remove pronouns and stop words.

    A keyword recognised as a pronoun is skipped together with its relations.
    Related words additionally keep adjectives (ADJ) and adverbs (ADV).

    Args:
        filtered_keyword_relations: Filtered list of sentence dicts with keywords

    Returns:
        List of dicts with 'sentence' and 'words' (sorted list of unique lemmas)
    """
    sentence_lists = []

    for sent_info in filtered_keyword_relations:
        words = set()

        for kw in sent_info['keywords']:
            # The pronoun check comes first, so it also wins over a PROPN tag.
            if _is_pronoun(kw['pos'], kw['lemma']):
                continue

            if _passes_stop_filter(kw['pos'], kw['lemma']):
                words.add(kw['lemma'])

            for rel in kw['relations']:
                if _is_pronoun(rel['pos'], rel['lemma']):
                    continue
                if rel['pos'] in ('ADJ', 'ADV') or _passes_stop_filter(rel['pos'], rel['lemma']):
                    words.add(rel['lemma'])

        sentence_lists.append({
            'sentence': sent_info['sentence'],
            'words': sorted(words)  # Sort for consistent output
        })

    return sentence_lists
621
def is_common_phrase(phrase):
    """Return True when *phrase* contains only common words.

    The phrase is parsed with the module-level spaCy pipeline. It is rejected
    (False) as soon as a non-punctuation token is either a proper noun or an
    alphabetic non-stop word whose ``rank`` is above 10000 or equal to 0; this
    code treats both rank conditions as "rare or no rank information".
    """
    for token in nlp(phrase):
        if token.is_punct:
            continue
        if token.pos_ == "PROPN":
            return False
        is_content_word = token.is_alpha and not token.is_stop
        if is_content_word and (token.rank == 0 or token.rank > 10000):
            return False
    return True
639
def print_keywords(keywords, time_taken, file, extractor_name="RAKE", lemmatized_data=None):
    """Write one extractor's keywords to *file*.

    Args:
        keywords: List of ``(score, keyword)`` pairs; when empty, only a
            "No ... keywords extracted" line is written.
        time_taken: Extraction time in seconds, shown in the total line.
        file: Writable text stream.
        extractor_name: Label used in the header and the empty message.
        lemmatized_data: Optional list of ``(score, original, lemmatized)``
            tuples. When non-empty it is listed instead of *keywords*, and the
            lemmatized form is appended whenever it differs from the
            lowercased original.
    """
    if not keywords:
        file.write(f"No {extractor_name} keywords extracted\n")
        return

    file.write(f"{extractor_name} Keywords:\n")

    if lemmatized_data:
        for score, original, lemmatized in lemmatized_data:
            if lemmatized != original.lower():
                lemma_suffix = f" → {lemmatized}"
            else:
                lemma_suffix = ""
            file.write(f" - {original} (score: {score:.2f}){lemma_suffix}\n")
    else:
        for score, word in keywords:
            file.write(f" - {word} (score: {score:.2f})\n")

    # The total always counts `keywords`, even when lemmatized_data was listed.
    file.write(f"Total keywords: {len(keywords)} extracted in {time_taken:.4f} seconds\n")
658
def print_named_entities(entities, file):
    """Write named entities to *file*.

    Args:
        entities: List of ``(entity_text, entity_type)`` pairs.
        file: Writable text stream.
    """
    if not entities:
        file.write("\nNo named entities extracted\n")
        return

    file.write("\nNamed Entities:\n")
    for name, label in entities:
        file.write(f" - {name} ({label})\n")
    file.write(f"Total named entities: {len(entities)}\n")
668
def print_phrases(noun_phrases, verb_phrases, file):
    """Write the noun-phrase section followed by the verb-phrase section to *file*.

    Each section has a header, then either one line per phrase plus a total,
    or a single "No ... phrases extracted" line when the list is empty.
    """
    sections = (
        ("Noun", "noun", noun_phrases),
        ("Verb", "verb", verb_phrases),
    )
    for title, label, phrases in sections:
        file.write(f"\n{title} Phrases:\n")
        if not phrases:
            file.write(f" No {label} phrases extracted\n")
            continue
        for phrase in phrases:
            file.write(f" - {phrase}\n")
        file.write(f"Total {label} phrases: {len(phrases)}\n")
686
def print_dependencies(sentences_deps, time_taken, file):
    """Write the per-sentence dependency report to *file*.

    Args:
        sentences_deps: List of sentence dicts with 'sentence' and
            'dependencies', plus optional 'verb_relations' and
            'independent_nouns' (sections are skipped when missing or empty).
        time_taken: Extraction time in seconds, shown in the header.
        file: Writable text stream.
    """
    file.write(f"\nDependency Relations (extracted in {time_taken:.4f}sec):\n")

    if not sentences_deps:
        file.write(" No dependencies extracted\n")
        return

    for number, sent_info in enumerate(sentences_deps, 1):
        file.write(f"\n Sentence {number}: {sent_info['sentence']}\n")

        # Raw token -> head arcs.
        file.write(" Dependencies:\n")
        for arc in sent_info['dependencies']:
            file.write(f" {arc['token']} ({arc['pos']}) --[{arc['dep']}]--> {arc['head']} ({arc['head_pos']})\n")

        # Each verb with its noun and adverb children.
        verb_relations = sent_info.get('verb_relations')
        if verb_relations:
            file.write(" Verb Relations:\n")
            for relation in verb_relations:
                nouns_str = 'none' if not relation['nouns'] else ', '.join(relation['nouns'])
                advs_str = 'none' if not relation['advs'] else ', '.join(relation['advs'])
                file.write(f" VERB: {relation['verb']} | NOUNs: {nouns_str} | ADVs: {advs_str}\n")

        # Nouns that are not attached to any verb.
        independent_nouns = sent_info.get('independent_nouns')
        if independent_nouns:
            file.write(" Independent NOUNs (not verb-dependent):\n")
            for noun_info in independent_nouns:
                line = f" {noun_info['noun']} --[{noun_info['dep']}]--> {noun_info['head']} ({noun_info['head_pos']})"
                # When the noun hangs off a preposition, extend the chain by one hop.
                if 'prep_head' in noun_info:
                    line += f" --[{noun_info['prep_dep']}]--> {noun_info['prep_head']} ({noun_info['prep_head_pos']})"
                file.write(line)
                file.write("\n")

    file.write(f"\nTotal sentences: {len(sentences_deps)}\n")
720
def print_keyword_relationships(keyword_relations, file, header="Keyword Relationships (All NOUNs and VERBs)"):
    """Write each sentence's keywords and their related words to *file*.

    Args:
        keyword_relations: List of sentence dicts with 'sentence' and
            'keywords'; each keyword dict needs 'lemma', 'pos' and 'relations'
            (relation dicts need 'lemma' and 'type').
        file: Writable text stream.
        header: Section title written before the listing.
    """
    file.write(f"\n{header}:\n")

    if not keyword_relations:
        file.write(" No keywords found\n")
        return

    for sent_info in keyword_relations:
        file.write(f"\n Sentence: {sent_info['sentence']}\n")
        file.write(" Keywords with relations:\n")

        for kw in sent_info['keywords']:
            related_words = [f"{rel['lemma']}({rel['type']})" for rel in kw['relations']]
            if related_words:
                related_str = ', '.join(related_words)
            else:
                related_str = 'none'
            file.write(f" {kw['lemma']} [{kw['pos']}]: {related_str}\n")
740
# Running totals across the whole dataset.
count = 0
rake_total_time = 0
yake_total_time = 0
keybert_total_time = 0
dep_total_time = 0
total_keyword_density = 0.0
all_keywords = set()  # Unique keywords across all messages
total_words = 0  # Whitespace-delimited words across all messages


def _average(total, n):
    """Return ``total / n``, or 0.0 when *n* is zero (e.g. an empty dataset)."""
    return total / n if n else 0.0


with open(output_file, 'w', encoding='utf-8') as f:
    for item in rawData:
        # One message per dataset entry: "<speaker>: <content>".
        message = item['speaker'] + ": " + item['content']

        # Time each extractor with perf_counter (monotonic, high resolution).
        rake_start = time.perf_counter()
        rake_keywords = extract_keywords_rake(message)
        rake_time = time.perf_counter() - rake_start
        rake_total_time += rake_time

        yake_start = time.perf_counter()
        yake_keywords = extract_keywords_yake(message)
        yake_time = time.perf_counter() - yake_start
        yake_total_time += yake_time

        keybert_start = time.perf_counter()
        keybert_keywords = extract_keywords_keybert(message)
        keybert_time = time.perf_counter() - keybert_start
        keybert_total_time += keybert_time

        # Dependency extraction also returns the parsed doc, reused below.
        dep_start = time.perf_counter()
        dependencies, spacy_doc = extract_dependencies(message)
        dep_time = time.perf_counter() - dep_start
        dep_total_time += dep_time

        # Lemmatize RAKE/YAKE keyphrases against the already-parsed doc.
        rake_lemmatized = lemmatize_rake_keyphrases_from_doc(rake_keywords, spacy_doc)
        yake_lemmatized = lemmatize_yake_keyphrases_from_doc(yake_keywords, spacy_doc)

        # Keyphrase co-occurrence analysis (computed, but not written to the report).
        rake_relationships = analyze_rake_phrase_relationships(rake_keywords, spacy_doc)
        yake_relationships = analyze_yake_phrase_relationships(yake_keywords, spacy_doc)

        # Keyword relationships for indexing, before and after stop-word filtering.
        keyword_relationships = extract_keyword_relations(spacy_doc, message)
        filtered_keyword_relationships = filter_keywords_by_stopwords(keyword_relationships)

        # Named-entity and phrase extraction are currently disabled:
        # entities = extract_named_entities(message)
        # noun_phrases, verb_phrases = extract_phrases(message)

        # Message header.
        f.write(f"\n{'='*80}\n")
        f.write(f"Message {count + 1}:\n")
        f.write(message + "\n")
        f.write('-' * 80 + "\n")

        if verbose:
            print_keywords(rake_keywords, rake_time, f, "RAKE", lemmatized_data=rake_lemmatized)

            f.write("\n")
            print_keywords(yake_keywords, yake_time, f, "YAKE", lemmatized_data=yake_lemmatized)

            f.write("\n")
            print_keywords(keybert_keywords, keybert_time, f, "KeyBERT")

            print_dependencies(dependencies, dep_time, f)

            # Unfiltered, then filtered keyword relationships.
            print_keyword_relationships(keyword_relationships, f)
            print_keyword_relationships(filtered_keyword_relationships, f, header="Filtered Keywords (Stop Words Removed)")

        # Flat word list per sentence (always written, verbose or not).
        sentence_word_lists = extract_sentence_word_lists(filtered_keyword_relationships)
        f.write("\nSentence Word Lists (Keywords + Relations):\n")
        for sent_list in sentence_word_lists:
            f.write(f" {sent_list['words']}\n")

        # Unique keywords of this message, folded into the global set.
        all_keywords_this_message = set()
        for sent_list in sentence_word_lists:
            all_keywords_this_message.update(sent_list['words'])
        all_keywords.update(all_keywords_this_message)

        message_word_count = len(message.split())
        total_words += message_word_count
        unique_keyword_count = len(all_keywords_this_message)

        f.write("\nWord Statistics:\n")
        f.write(f" Total words in message: {message_word_count}\n")
        f.write(f" Unique filtered keywords: {unique_keyword_count}\n")
        if message_word_count > 0:
            percentage = (unique_keyword_count / message_word_count) * 100
            f.write(f" Keyword density: {percentage:.1f}%\n")
            total_keyword_density += percentage

        # Per-message timing comparison (verbose only).
        if verbose:
            f.write("\nTiming Comparison:\n")
            f.write(f" RAKE: {rake_time:.4f}sec\n")
            f.write(f" YAKE: {yake_time:.4f}sec\n")
            f.write(f" KeyBERT: {keybert_time:.4f}sec\n")
            f.write(f" Dependencies: {dep_time:.4f}sec\n")
            times = {'RAKE': rake_time, 'YAKE': yake_time, 'KeyBERT': keybert_time, 'Dependencies': dep_time}
            fastest = min(times, key=times.get)
            f.write(f" Fastest: {fastest}\n")

        count += 1

        # Progress indicator on stdout every 50 messages.
        if count % 50 == 0:
            print(f"Progress: Processed {count} messages...")

    # Overall summary. Averages go through _average so an empty dataset
    # (count == 0) produces a report instead of a ZeroDivisionError.
    f.write(f"\n\n{'='*80}\n")
    f.write(f"OVERALL TIMING SUMMARY ({count} messages):\n")
    f.write(f"{'='*80}\n")
    f.write(f"Total RAKE time: {rake_total_time:.4f}sec (avg: {_average(rake_total_time, count):.4f}sec per message)\n")
    f.write(f"Total YAKE time: {yake_total_time:.4f}sec (avg: {_average(yake_total_time, count):.4f}sec per message)\n")
    f.write(f"Total KeyBERT time: {keybert_total_time:.4f}sec (avg: {_average(keybert_total_time, count):.4f}sec per message)\n")
    f.write(f"Total Dependency time: {dep_total_time:.4f}sec (avg: {_average(dep_total_time, count):.4f}sec per message)\n")

    avg_keyword_density = _average(total_keyword_density, count)
    f.write(f"\nAverage keyword density: {avg_keyword_density:.1f}%\n")
    f.write(f"\nTotal words across all messages: {total_words}\n")
    f.write(f"Unique keywords across all messages: {len(all_keywords)}\n")
    if total_words > 0:
        overall_density = (len(all_keywords) / total_words) * 100
        f.write(f"Overall keyword density: {overall_density:.1f}%\n")

    total_times = {'RAKE': rake_total_time, 'YAKE': yake_total_time, 'KeyBERT': keybert_total_time, 'Dependencies': dep_total_time}
    fastest = min(total_times, key=total_times.get)
    slowest = max(total_times, key=total_times.get)
    f.write(f"\nOverall fastest: {fastest}\n")
    f.write(f"Overall slowest: {slowest}\n")
    # The ratio is undefined when the fastest total is zero (nothing measured).
    if total_times[fastest] > 0:
        speedup = total_times[slowest] / total_times[fastest]
        f.write(f"Speedup factor (fastest vs slowest): {speedup:.2f}x\n")
    else:
        speedup = None
        f.write("Speedup factor (fastest vs slowest): n/a\n")

# Console summary mirroring the key totals from the report.
print(f"Extraction complete. Results written to {output_file}")
print(f"Processed {count} messages")
print(f"RAKE total time: {rake_total_time:.4f}sec (avg: {_average(rake_total_time, count):.4f}sec)")
print(f"YAKE total time: {yake_total_time:.4f}sec (avg: {_average(yake_total_time, count):.4f}sec)")
print(f"KeyBERT total time: {keybert_total_time:.4f}sec (avg: {_average(keybert_total_time, count):.4f}sec)")
print(f"Dependency time: {dep_total_time:.4f}sec (avg: {_average(dep_total_time, count):.4f}sec)")
total_times = {'RAKE': rake_total_time, 'YAKE': yake_total_time, 'KeyBERT': keybert_total_time, 'Dependencies': dep_total_time}
fastest = min(total_times, key=total_times.get)
print(f"Overall fastest: {fastest}")
919