Tokenization etc for sentiment with example

PHOTO

Fri May 31 2024 18:16:47 GMT+0000 (Coordinated Universal Time)

from pymongo import MongoClient, errors
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# MongoDB connection settings
mongo_uri = "mongodb://localhost:27017/"
database_name = "twitter_database"
source_collection_name = "tweets"

# Initialize NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def get_text_from_doc(doc):
    """
    Extracts the text from the document.
    Returns the text if available, otherwise returns None.
    """
    text = doc.get("json_data", {}).get("text")
    if text:
        return text
    extended_text = doc.get("json_data", {}).get("extended_tweet", {}).get("full_text")
    return extended_text

def preprocess_text(text):
  
    # Remove punctuation
    tokens = [token for token in tokens if token not in string.punctuation]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english') + stopwords.words('spanish') + stopwords.words('dutch'))    
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return tokens

def analyze_sentiment(text):
    """
    Analyzes the sentiment of the given text using VADER.
    Returns the compound sentiment score.
    """
    analyzer = SentimentIntensityAnalyzer()
    sentiment = analyzer.polarity_scores(text)
    return sentiment['compound']

try:
    client = MongoClient(mongo_uri, serverSelectionTimeoutMS=5000)
    db = client[database_name]
    source_collection = db[source_collection_name]

    cursor = source_collection.find().limit(20)  # Limit to only 20 tweets

    for doc in cursor:
        text = get_text_from_doc(doc)
        if text:
            preprocessed_text = preprocess_text(text)
            processed_text = ' '.join(preprocessed_text)  # Convert preprocessed tokens back to text
            print("Original Text:", text)
            print("Processed Text:", processed_text)
            print("Sentiment Score:", analyze_sentiment(processed_text))
            print()

except errors.ServerSelectionTimeoutError as err:
    print("Failed to connect to MongoDB server:", err)
except errors.PyMongoError as err:
    print("An error occurred while working with MongoDB:", err)
finally:
    client.close()

COPY