Tokenization, etc., for sentiment analysis (with example)
Fri May 31 2024 18:16:47 GMT+0000 (Coordinated Universal Time)
Saved by @madgakantara
import string
from functools import lru_cache

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from pymongo import MongoClient, errors
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# MongoDB connection settings
mongo_uri = "mongodb://localhost:27017/"
database_name = "twitter_database"
source_collection_name = "tweets"

# Initialize NLTK resources
nltk.download('punkt')
# NOTE(review): recent NLTK releases appear to load the tokenizer models from
# 'punkt_tab' instead of 'punkt' -- confirm against the installed version.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


def get_text_from_doc(doc):
    """
    Extracts the text from the document.

    Looks at ``doc["json_data"]["text"]`` first and falls back to
    ``doc["json_data"]["extended_tweet"]["full_text"]``.
    Returns the text if available, otherwise returns None.
    """
    # `or {}` also covers keys that are present but explicitly set to None,
    # which a plain `.get(key, {})` default would not.
    json_data = doc.get("json_data") or {}
    text = json_data.get("text")
    if text:
        return text
    extended_tweet = json_data.get("extended_tweet") or {}
    return extended_tweet.get("full_text")


@lru_cache(maxsize=1)
def _stop_words():
    """Build the combined English/Spanish/Dutch stop-word set once."""
    return frozenset(
        stopwords.words('english')
        + stopwords.words('spanish')
        + stopwords.words('dutch')
    )


@lru_cache(maxsize=1)
def _lemmatizer():
    """Return a shared WordNetLemmatizer instance."""
    return WordNetLemmatizer()


@lru_cache(maxsize=1)
def _sentiment_analyzer():
    """Return a shared VADER analyzer instead of rebuilding it per call."""
    return SentimentIntensityAnalyzer()


def preprocess_text(text):
    """
    Tokenizes the text and cleans up the tokens.

    Steps: tokenize with NLTK's word_tokenize, drop punctuation tokens,
    drop English/Spanish/Dutch stop words, then lemmatize what is left.
    Returns the resulting list of tokens.
    """
    # Tokenization (the original used `tokens` without ever creating it,
    # so every call failed before reaching the filters below).
    tokens = word_tokenize(text)

    # Remove punctuation
    tokens = [token for token in tokens if token not in string.punctuation]

    # Remove stopwords
    stop_words = _stop_words()
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatization
    lemmatizer = _lemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return tokens


def analyze_sentiment(text):
    """
    Analyzes the sentiment of the given text using VADER.
    Returns the compound sentiment score.
    """
    sentiment = _sentiment_analyzer().polarity_scores(text)
    return sentiment['compound']


# Bound before the `try` so the `finally` clause can tell whether a client
# was actually created; otherwise a failure inside MongoClient(...) would be
# masked by a NameError from `client.close()`.
client = None
try:
    client = MongoClient(mongo_uri, serverSelectionTimeoutMS=5000)
    db = client[database_name]
    source_collection = db[source_collection_name]

    cursor = source_collection.find().limit(20)  # Limit to only 20 tweets

    for doc in cursor:
        text = get_text_from_doc(doc)
        if not text:
            continue
        preprocessed_text = preprocess_text(text)
        # Convert preprocessed tokens back to text
        processed_text = ' '.join(preprocessed_text)
        print("Original Text:", text)
        print("Processed Text:", processed_text)
        print("Sentiment Score:", analyze_sentiment(processed_text))
        print()

except errors.ServerSelectionTimeoutError as err:
    print("Failed to connect to MongoDB server:", err)
except errors.PyMongoError as err:
    print("An error occurred while working with MongoDB:", err)
finally:
    if client is not None:
        client.close()
Comments