Tokenization etc. for sentiment analysis, with an example
Fri May 31 2024 18:16:47 GMT+0000 (Coordinated Universal Time)
Saved by @madgakantara
from pymongo import MongoClient, errors
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
# Where the tweets live: a local MongoDB instance, one database, one collection.
mongo_uri = "mongodb://localhost:27017/"
database_name = "twitter_database"
source_collection_name = "tweets"

# Download the NLTK data packages this script relies on (tokenizer models,
# stopword lists and the WordNet corpus) before any text is processed.
for _nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)
def get_text_from_doc(doc):
    """
    Extracts the tweet text from a stored document.

    The text is looked up under ``doc["json_data"]["text"]`` first; when that
    is missing or empty, ``doc["json_data"]["extended_tweet"]["full_text"]``
    is used instead.

    Args:
        doc: Mapping for one stored tweet.

    Returns:
        The text if available, otherwise None (or whatever falsy value is
        stored under ``full_text``).
    """
    # `or {}` also covers keys that are present but hold None; a plain
    # .get(key, {}) default only applies when the key is absent altogether.
    json_data = doc.get("json_data") or {}
    text = json_data.get("text")
    if text:
        return text
    extended_tweet = json_data.get("extended_tweet") or {}
    return extended_tweet.get("full_text")
def preprocess_text(text):
    """
    Tokenizes and normalizes text before sentiment scoring.

    Steps: lowercase and tokenize, drop punctuation-only tokens, drop
    English/Spanish/Dutch stopwords, then lemmatize what is left.

    Args:
        text: Raw text to process. Falsy input (None or "") yields [].

    Returns:
        List of processed token strings.
    """
    if not text:
        return []
    # Tokenization. This step was missing, so `tokens` was read before it was
    # ever assigned. Lowercasing lets tokens match the lowercase stopword lists.
    tokens = word_tokenize(text.lower())
    # Remove punctuation: drop tokens made up solely of punctuation characters
    # (a substring test against string.punctuation lets tokens like "..." through).
    punctuation_chars = set(string.punctuation)
    tokens = [
        token for token in tokens
        if any(char not in punctuation_chars for char in token)
    ]
    # Remove stopwords
    stop_words = set(stopwords.words('english') + stopwords.words('spanish') + stopwords.words('dutch'))
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]
# Shared analyzer instance, created on first use so that repeated calls do not
# construct a new SentimentIntensityAnalyzer every time.
_vader_analyzer = None


def analyze_sentiment(text):
    """
    Analyzes the sentiment of the given text using VADER.

    Args:
        text: The text to score.

    Returns:
        The 'compound' entry of VADER's polarity scores for the text.
    """
    global _vader_analyzer
    if _vader_analyzer is None:
        _vader_analyzer = SentimentIntensityAnalyzer()
    sentiment = _vader_analyzer.polarity_scores(text)
    return sentiment['compound']
# Read a small sample of tweets from MongoDB, preprocess each one and print
# its original text, processed text and sentiment score.
try:
    # The client is used as a context manager so it is always closed, and so a
    # failure inside MongoClient(...) itself cannot lead to a NameError on an
    # unbound `client` (which the previous `finally: client.close()` could).
    with MongoClient(mongo_uri, serverSelectionTimeoutMS=5000) as client:
        db = client[database_name]
        source_collection = db[source_collection_name]
        cursor = source_collection.find().limit(20)  # Limit to only 20 tweets
        for doc in cursor:
            text = get_text_from_doc(doc)
            if not text:
                # Nothing to analyze for documents without any text.
                continue
            preprocessed_text = preprocess_text(text)
            processed_text = ' '.join(preprocessed_text)  # Convert preprocessed tokens back to text
            print("Original Text:", text)
            print("Processed Text:", processed_text)
            print("Sentiment Score:", analyze_sentiment(processed_text))
            print()
except errors.ServerSelectionTimeoutError as err:
    print("Failed to connect to MongoDB server:", err)
except errors.PyMongoError as err:
    print("An error occurred while working with MongoDB:", err)



Comments