WORD CLOUD MONGO
Tue Jun 18 2024 21:36:14 GMT+0000 (Coordinated Universal Time)
Saved by
@madgakantara
from deep_translator import GoogleTranslator
from pymongo import MongoClient
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
# Where the tweets live: server URI, database and collection names.
mongo_uri = "mongodb://localhost:27017/"
database_name, collection_name = "twitter_database", "final"

# Open the client, then resolve the database and collection handles by name.
client = MongoClient(mongo_uri)
db = client.get_database(database_name)
collection = db.get_collection(collection_name)
# Function to translate text to English
def translate_to_english(text, lang):
    """Return *text* in English, translating it when it is not already English.

    Args:
        text: The text to translate. May be ``None`` or blank.
        lang: Language code reported for *text*. ``'en'`` means the text is
            returned unchanged; any other value (including ``None``) sends
            the text to Google Translate with automatic source detection.

    Returns:
        The English text, or an empty string when *text* is ``None`` or
        contains only whitespace.
    """
    # Nothing to translate: avoid handing None/blank payloads to the
    # translator and give callers a string they can tokenize safely.
    if text is None or not str(text).strip():
        return ""

    if lang == 'en':
        return text

    return GoogleTranslator(source='auto', target='en').translate(text)
# Fetch text data from MongoDB (only the first 100 documents are read).
# NOTE: `texts` is not populated below; it is kept so that any later code
# referencing the name keeps working.
texts = []
cursor = collection.find().limit(100)

# Initialize stopwords as a set for O(1) membership tests in the loop below.
stop_words = set(stopwords.words('english'))


def _get_nested_field(document, dotted_path):
    """Look up a MongoDB-style dotted path (e.g. "json_data.text") in a document.

    A key literally named *dotted_path* takes precedence; otherwise the path
    is walked through nested dicts. Returns ``None`` when any step is missing
    or is not a dict.
    """
    if dotted_path in document:
        return document[dotted_path]
    value = document
    for key in dotted_path.split("."):
        if not isinstance(value, dict):
            return None
        value = value.get(key)
    return value


# Process each document fetched from MongoDB
processed_texts = []
for doc in cursor:
    # Plain dict.get() does not understand dotted paths, so resolve them
    # explicitly.
    text = _get_nested_field(doc, "json_data.text")
    lang = _get_nested_field(doc, "json_data.lang")

    # Skip documents without usable text instead of crashing the whole run.
    if not isinstance(text, str) or not text.strip():
        continue

    text = translate_to_english(text, lang)
    if not text:
        # Defensive: nothing came back from translation, nothing to tokenize.
        continue

    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove punctuation and stopwords
    tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]
    # Keep the cleaned text of this document for the word cloud
    processed_texts.append(" ".join(tokens))

# Accumulate all text for generating the word cloud; each document's text is
# followed by a single space.
all_text = "".join(chunk + " " for chunk in processed_texts)

if not all_text.strip():
    # WordCloud.generate() needs at least one word; report that plainly
    # instead of failing with an opaque error.
    print("No usable text found in the fetched documents; word cloud not generated.")
else:
    # Generate the word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)
    # Plotting the word cloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud of Text Field in Tweets')
    plt.show()
content_copyCOPY
Comments