# tạo wordcloud (create a word cloud)
# Sat Mar 04 2023 02:21:44 GMT+0000 (Coordinated Universal Time)
# Saved by @QuinnFox12
# tokenize your string with nltk
def tokenize_text(text: str):
    """Clean *text* and split it into lowercase tokens without stopwords.

    Steps, in order: whitespace control characters are replaced by spaces,
    the text is lowercased, every character that is neither a word
    character nor whitespace is dropped, the result is tokenized with
    ``nltk.word_tokenize`` and English stopwords are filtered out.

    Args:
        text: The raw input string.

    Returns:
        A list of token strings.

    Note:
        The NLTK tokenizer data and the ``stopwords`` corpus are not
        downloaded here; presumably they must already be installed
        (verify against the NLTK version in use).
    """
    # Imports stay function-local, as in the original snippet.
    import re

    import nltk
    from nltk.corpus import stopwords

    # Replace newline, carriage-return and tab characters with a space.
    # "\\d" is the literal two-character sequence backslash + "d"; it is
    # spelled with an explicit escape because a bare "\d" in a normal string
    # is an invalid escape sequence (it does NOT match digits here).
    for char in ["\n", "\r", "\\d", "\t"]:
        text = text.replace(char, " ")

    # Lowercase so the stopword comparison below is case-insensitive.
    text = text.lower()

    # Strip punctuation: anything that is not a word character or whitespace.
    text = re.sub(r"[^\w\s]", "", text)

    tokens = nltk.word_tokenize(text)

    # A set gives O(1) membership tests while filtering.
    english_stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in english_stop_words]
# lemmatize your tokens with nltk
def lemmatize_tokens(tokens):
    """Lemmatize every token with NLTK's ``WordNetLemmatizer``.

    Args:
        tokens: An iterable of token strings.

    Returns:
        A list with one lemmatized string per input token, in the same
        order. ``lemmatize`` is called with its default arguments.
    """
    # Imports stay function-local, as in the original snippet.
    import nltk
    from nltk.stem import WordNetLemmatizer

    # Requested on every call, before the lemmatizer is used.
    nltk.download('wordnet')

    # A single lemmatizer instance is enough for the whole token list.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens]
# create a wordcloud
def flexbile_wordcloud_function(text: str,
                                output_filepath: str,
                                mask_path=None,
                                white_mask_background=True,
                                width=725,
                                height=300,
                                background_color="white",
                                colormap="viridis",
                                contour_color="steelblue",
                                contour_width=3,
                                collocations=False,
                                max_words=2000,
                                max_font_size=40,
                                min_font_size=4,
                                prefer_horizontal=0.9,
                                include_numbers=True):
    """Build a word cloud from *text*, save it to disk and return its image.

    The text is tokenized with ``tokenize_text`` and lemmatized with
    ``lemmatize_tokens`` before being handed to ``wordcloud.WordCloud``.

    Args:
        text: Raw input text.
        output_filepath: Path passed to ``WordCloud.to_file``.
        mask_path: Optional path of a mask image. When given, the cloud is
            sized from the mask and ``width`` / ``height`` are not used.
        white_mask_background: Pass a falsy value when the mask background
            is black (value 0); those values are then rewritten to 255.
        width: Canvas width, used only without a mask.
        height: Canvas height, used only without a mask.
        background_color: Forwarded to ``WordCloud``.
        colormap: Forwarded to ``WordCloud``.
        contour_color: Forwarded to ``WordCloud`` only when a mask is used.
        contour_width: Forwarded to ``WordCloud`` only when a mask is used.
        collocations: Forwarded to ``WordCloud``.
        max_words: Forwarded to ``WordCloud``.
        max_font_size: Forwarded to ``WordCloud``.
        min_font_size: Forwarded to ``WordCloud``.
        prefer_horizontal: Forwarded to ``WordCloud``.
        include_numbers: Forwarded to ``WordCloud``.

    Returns:
        The result of ``WordCloud.to_image()``.

    Side effects:
        Prints the elapsed time, draws the cloud on the current matplotlib
        axes with ``plt.imshow`` (``plt.show`` is not called) and writes the
        image to ``output_filepath``.
    """
    # Start the timer first so it also covers tokenizing and the imports.
    import time
    start = time.time()

    tokens = tokenize_text(text=text)
    lemmatized_tokens = lemmatize_tokens(tokens=tokens)

    # Imports stay function-local, as in the original snippet.
    from wordcloud import WordCloud
    from PIL import Image
    import matplotlib.pyplot as plt
    import numpy as np

    # Options shared by the masked and unmasked variants.
    cloud_options = dict(background_color=background_color,
                         colormap=colormap,
                         collocations=collocations,
                         max_words=max_words,
                         max_font_size=max_font_size,
                         min_font_size=min_font_size,
                         prefer_horizontal=prefer_horizontal,
                         include_numbers=include_numbers)

    if mask_path is None:
        # No mask: use the caller-supplied canvas size.
        cloud_options.update(width=width, height=height)
    else:
        # Copy the pixels into an array, then release the file handle.
        with Image.open(mask_path) as mask_image:
            mask = np.array(mask_image)
        if not white_mask_background:
            # NOTE(review): every array element equal to 0 becomes 255, so on
            # a multi-channel mask single zero-valued channels are changed
            # too -- confirm masks are single-channel or pure black.
            mask[mask == 0] = 255
        # The canvas size is taken from the mask (rows = height, cols = width).
        cloud_options.update(mask=mask,
                             width=mask.shape[1],
                             height=mask.shape[0],
                             contour_color=contour_color,
                             contour_width=contour_width)

    wordcloud = WordCloud(**cloud_options)

    # generate() takes a single string, so the tokens are joined back together.
    wordcloud.generate(','.join(lemmatized_tokens))

    end = time.time()
    print(f"wordcloud created in {round(end-start, 1)} seconds")

    # Display, save, and return the word cloud.
    plt.imshow(wordcloud)
    wordcloud.to_file(output_filepath)
    return wordcloud.to_image()



# Comments