microsoft/AI-For-Beginners
Public mirrored from https://github.com/microsoft/AI-For-Beginners Available
examples/04-text-sentiment.py
268 lines · mode code
| 1 | """ |
| 2 | Simple Text Sentiment Analysis |
| 3 | ================================ |
| 4 | |
| 5 | This example shows how to analyze the sentiment (emotion) of text. |
| 6 | It's a simplified version that teaches NLP concepts without complex libraries. |
| 7 | |
| 8 | What you'll learn: |
| 9 | - Text preprocessing (cleaning and preparing text) |
| 10 | - Feature extraction (converting words to numbers) |
| 11 | - Sentiment classification (positive vs negative) |
| 12 | |
| 13 | Use case: Determine if a movie review is positive or negative. |
| 14 | """ |
| 15 | |
| 16 | import re |
| 17 | from collections import Counter |
| 18 | |
class SimpleSentimentAnalyzer:
    """
    A basic sentiment analyzer that learns from labeled examples.

    How it works:
    1. Learns which words appear more in positive vs negative texts
    2. Calculates a "sentiment score" for each word
    3. Averages the scores of the known words in new text to predict
       its sentiment
    """

    def __init__(self):
        # Store word scores (positive words get positive scores)
        self.word_scores = {}
        # Track if we've trained
        self.is_trained = False

    def preprocess_text(self, text):
        """
        Clean and prepare text for analysis.

        Steps:
        1. Convert to lowercase
        2. Drop apostrophes, turn all other non-letters into spaces
        3. Split into words
        4. Discard words of one or two characters

        Args:
            text: Raw text string

        Returns:
            List of cleaned words
        """
        # Convert to lowercase
        text = text.lower()

        # Drop apostrophes (straight and curly) so a contraction stays a
        # single token: "don't" -> "dont"
        text = re.sub(r"['\u2019]", '', text)

        # Replace every other non-letter with a space. Deleting them instead
        # would glue together words separated only by punctuation
        # ("great,loved" -> "greatloved").
        text = re.sub(r'[^a-z\s]', ' ', text)

        # Split into words, keeping only those longer than two characters
        # (drops "a", "i", but also "is", "it", "of", ...)
        return [w for w in text.split() if len(w) > 2]

    def train(self, training_data):
        """
        Learn sentiment patterns from labeled examples.

        Any previously learned scores are discarded, so calling this again
        retrains from scratch.

        Args:
            training_data: List of (text, sentiment) tuples
                where sentiment is 'positive' or 'negative'.
                Any label other than 'positive' is counted as negative.
        """
        # NOTE(review): the emoji in these messages were mis-encoded in the
        # copy this was restored from; confirm the glyphs against upstream.
        print("📚 Training sentiment analyzer...")

        # Count words in positive and negative texts
        positive_words = Counter()
        negative_words = Counter()

        for text, sentiment in training_data:
            words = self.preprocess_text(text)

            if sentiment == 'positive':
                positive_words.update(words)
            else:
                negative_words.update(words)

        # Calculate sentiment score for each word
        # Score > 0 means more positive, < 0 means more negative
        # Build into a fresh dict so words from an earlier train() call
        # do not linger in the model.
        scores = {}
        for word in positive_words.keys() | negative_words.keys():
            pos_count = positive_words[word]
            neg_count = negative_words[word]

            # Difference in appearances, divided by (total + 1).
            # Every word here was seen at least once, so the +1 is not
            # guarding against division by zero; it damps the score of
            # rarely seen words (a word seen once scores +/-0.5, not +/-1).
            total = pos_count + neg_count
            scores[word] = (pos_count - neg_count) / (total + 1)

        self.word_scores = scores
        self.is_trained = True

        # Show some learned words
        print(f"✅ Learned sentiment for {len(self.word_scores)} words")
        print("\n📈 Most positive words:")
        # Highest score first; ties are broken alphabetically so the listing
        # is the same on every run.
        sorted_words = sorted(
            self.word_scores.items(), key=lambda item: (-item[1], item[0])
        )
        for word, score in sorted_words[:5]:
            print(f" '{word}': {score:+.3f}")

        print("\n📉 Most negative words:")
        for word, score in sorted_words[-5:]:
            print(f" '{word}': {score:+.3f}")

    def analyze(self, text):
        """
        Predict the sentiment of new text.

        Args:
            text: Text to analyze

        Returns:
            Tuple of (sentiment, confidence, score) where score is the mean
            score of the words that were seen during training, confidence is
            abs(score) as a percentage capped at 100, and sentiment is
            "positive" when score > 0 and "negative" otherwise. Text with no
            known words scores 0 and is therefore reported as "negative"
            with 0 confidence.

        Raises:
            RuntimeError: If train() has not been called yet.
        """
        if not self.is_trained:
            raise RuntimeError("Please train the analyzer first!")

        # Scores of the words we know; unknown words are ignored
        known_scores = [
            self.word_scores[word]
            for word in self.preprocess_text(text)
            if word in self.word_scores
        ]

        # Average score (0 when no word is known)
        if known_scores:
            avg_score = sum(known_scores) / len(known_scores)
        else:
            avg_score = 0

        # Determine sentiment and confidence
        sentiment = "positive" if avg_score > 0 else "negative"
        confidence = min(abs(avg_score) * 100, 100)  # Convert to percentage

        return sentiment, confidence, avg_score
| 148 | |
| 149 | |
def create_training_data():
    """
    Build the small labeled corpus of movie reviews used for training.

    Eight positive reviews come first, followed by eight negative ones.
    A real application would use thousands of examples.

    Returns:
        List of (review_text, sentiment) tuples
    """
    positive_reviews = (
        "This movie was absolutely amazing and wonderful! I loved every minute.",
        "Brilliant performance! The acting was superb and the story captivating.",
        "Fantastic film! Highly recommend to everyone. Best movie of the year!",
        "Loved it! Great storytelling and beautiful cinematography.",
        "Excellent movie with outstanding performances. A must watch!",
        "Amazing! This film exceeded all my expectations. Truly remarkable.",
        "Wonderful experience! The plot was engaging and entertaining.",
        "Superb direction and acting! One of the best films I've seen.",
    )
    negative_reviews = (
        "Terrible movie. Waste of time and money. Very disappointed.",
        "Awful film! Poor acting and boring story. Would not recommend.",
        "Horrible! The worst movie I have ever seen. Extremely disappointing.",
        "Bad movie with terrible plot. Boring and predictable.",
        "Disappointing film. Poor execution and weak performances.",
        "Worst movie ever! Horrible acting and stupid storyline.",
        "Terrible experience. Boring and poorly made. Don't waste your time.",
        "Awful! Poor quality and uninteresting. Complete waste of time.",
    )

    # Attach the label to each review, positives before negatives
    labeled = [(review, "positive") for review in positive_reviews]
    labeled.extend((review, "negative") for review in negative_reviews)
    return labeled
| 180 | |
| 181 | |
def main():
    """
    Main function - Let's analyze some sentiments!

    Trains a SimpleSentimentAnalyzer on the sample reviews, prints its
    predictions for a fixed set of test reviews, then reads reviews from
    standard input until the user types 'quit', 'exit' or 'q' (or input
    ends), and finally prints a short explanation.
    """
    # NOTE(review): the emoji in these messages were mis-encoded in the copy
    # this was restored from. Some could be recovered from the surviving
    # bytes; the rest are best guesses - confirm against upstream.
    print("=" * 70)
    print("Simple Text Sentiment Analysis")
    print("=" * 70)
    print("\n📝 Task: Learn to identify positive and negative movie reviews")
    print()

    # Step 1: Create training data
    training_data = create_training_data()
    print(f"📊 Training data: {len(training_data)} movie reviews")
    print()

    # Step 2: Create and train analyzer
    analyzer = SimpleSentimentAnalyzer()
    analyzer.train(training_data)
    print()

    # Step 3: Test on new reviews
    print("🧪 Testing on new movie reviews:")
    print("=" * 70)

    test_reviews = [
        "This movie was fantastic! I really enjoyed it.",
        "Boring and terrible. Not worth watching.",
        "Amazing cinematography and wonderful acting!",
        "The worst film I've seen this year. Awful.",
        "Pretty good movie with some great moments.",
        "Disappointing and poorly directed.",
    ]

    for i, review in enumerate(test_reviews, 1):
        sentiment, confidence, score = analyzer.analyze(review)

        # Visual indicator
        indicator = "😊" if sentiment == "positive" else "😞"

        print(f"\nReview {i}:")
        print(f" Text: \"{review}\"")
        print(f" {indicator} Sentiment: {sentiment.upper()}")
        print(f" 📊 Confidence: {confidence:.1f}%")
        print(f" 📈 Score: {score:+.3f}")

    print("\n" + "=" * 70)

    # Interactive mode
    print("\n💬 Try it yourself! Enter your own review (or 'quit' to exit):")
    print("-" * 70)

    while True:
        try:
            user_input = input("\nYour review: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Input was closed (piped/redirected stdin, Ctrl-D) or the user
            # pressed Ctrl-C: leave the loop as if 'quit' had been typed
            # instead of crashing with a traceback.
            print()
            break

        if user_input.lower() in ['quit', 'exit', 'q']:
            break

        if not user_input:
            continue

        try:
            sentiment, confidence, score = analyzer.analyze(user_input)
            indicator = "😊" if sentiment == "positive" else "😞"

            print(f"\n{indicator} Sentiment: {sentiment.upper()}")
            print(f"📊 Confidence: {confidence:.1f}%")
            print(f"📈 Score: {score:+.3f}")
        except Exception as e:
            # Best effort: report the problem and keep the session alive
            print(f"Error: {e}")

    # Explanation
    print("\n💡 What just happened?")
    print("1. The analyzer learned word patterns from example reviews")
    print("2. It calculated 'sentiment scores' for words")
    print("3. For new text, it combines word scores to predict sentiment")
    print()
    print("🎉 You just built a sentiment analyzer!")
    print()
    print("📚 Next steps:")
    print(" - Add more training examples to improve accuracy")
    print(" - Try analyzing tweets, product reviews, or comments")
    print(" - Explore more advanced NLP in lessons/5-NLP/")
    print()
| 265 | |
| 266 | |
# Run the demo only when this file is executed as a script; importing it
# as a module does not call main().
if __name__ == "__main__":
    main()
| 269 | |