WORD CLOUD MONGO

PHOTO

Tue Jun 18 2024 21:36:14 GMT+0000 (Coordinated Universal Time)

from deep_translator import GoogleTranslator
from pymongo import MongoClient
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Where the data lives: server URI, database name and collection name
mongo_uri = "mongodb://localhost:27017/"
database_name = "twitter_database"
collection_name = "final"

# Open a client and resolve the database / collection handles used below
client = MongoClient(mongo_uri)
db = client.get_database(database_name)
collection = db.get_collection(collection_name)

# Function to translate text to English
def translate_to_english(text, lang):
    """Return *text* in English, translating only when it is not already English.

    Args:
        text: The text to translate. ``None`` or an empty string yields ``""``
            without calling the translator.
        lang: Language code reported for *text*; ``'en'`` means the text is
            returned unchanged.

    Returns:
        ``""`` for missing/empty input, the original text when ``lang`` is
        ``'en'``, otherwise the result of
        ``GoogleTranslator(source='auto', target='en').translate(text)``.
    """
    # Documents may lack a text field; never hand None/"" to the translator
    # or back to the tokenizer.
    if not text:
        return ""
    if lang == 'en':
        return text
    # The source language is auto-detected rather than taken from `lang`.
    return GoogleTranslator(source='auto', target='en').translate(text)

# Fetch text data from MongoDB (at most 100 documents)
texts = []  # one cleaned, space-joined token string per usable document
cursor = collection.find().limit(100)

# Initialize stopwords
stop_words = set(stopwords.words('english'))


def _get_field(document, dotted_key):
    """Look up *dotted_key* (e.g. ``"json_data.text"``) in a fetched document.

    ``dict.get`` treats a dotted string as one literal key and does not
    descend into nested sub-documents, so the literal key is tried first and
    then the path is walked one segment at a time.

    Returns the value found, or ``None`` when any segment is missing or a
    non-dict value is reached before the path ends.
    """
    if dotted_key in document:
        return document[dotted_key]
    value = document
    for part in dotted_key.split("."):
        if not isinstance(value, dict):
            return None
        value = value.get(part)
    return value


# Process each document fetched from MongoDB
for doc in cursor:
    text = _get_field(doc, "json_data.text")
    lang = _get_field(doc, "json_data.lang")

    # Skip documents without usable text instead of crashing the
    # translator or the tokenizer on None / non-string values.
    if not isinstance(text, str) or not text.strip():
        continue

    text = translate_to_english(text, lang)
    if not text:
        continue

    # Tokenize, keep purely alphabetic tokens, lowercase once, then drop
    # English stopwords.
    lowered = (word.lower() for word in word_tokenize(text) if word.isalpha())
    tokens = [word for word in lowered if word not in stop_words]

    if tokens:
        texts.append(" ".join(tokens))

# Accumulate all text for generating the word cloud (single join instead of
# repeated string concatenation).
all_text = " ".join(texts)

if not all_text:
    # WordCloud.generate cannot build a cloud from zero words; report the
    # situation clearly instead of failing inside the library.
    print("No usable text found in the fetched documents; skipping word cloud.")
else:
    # Generate the word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)

    # Plotting the word cloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud of Text Field in Tweets')
    plt.show()

COPY