# Text-cleaning helpers and a configurable word-cloud generator built on
# nltk and wordcloud.  Third-party packages are imported inside the functions
# that need them, so importing this module only requires the standard library.
import re
import time


def normalize_text(text: str) -> str:
    """Return *text* lowercased, with line/tab characters and punctuation removed.

    Newlines, carriage returns and tabs are each replaced by a single space,
    the result is lowercased, and every character that is neither a word
    character nor whitespace is deleted.
    """
    # "\\d" is the literal two-character sequence backslash + "d".  The
    # original code spelled it "\d" (an invalid escape sequence that Python
    # silently read the same way); it is kept so existing output is unchanged.
    for char in ("\n", "\r", "\\d", "\t"):
        text = text.replace(char, " ")
    text = text.lower()
    return re.sub(r"[^\w\s]", "", text)


def tokenize_text(text: str) -> list:
    """Tokenize *text* with nltk and drop English stop words.

    The text is first cleaned with ``normalize_text``.  This function does not
    download any nltk data itself, so the resources required by
    ``nltk.word_tokenize`` and the ``stopwords`` corpus must already be
    installed.

    Args:
        text: Raw input text.

    Returns:
        The list of remaining tokens, in their original order.
    """
    import nltk
    from nltk.corpus import stopwords

    tokens = nltk.word_tokenize(normalize_text(text))
    english_stop_words = set(stopwords.words("english"))
    return [word for word in tokens if word not in english_stop_words]


def lemmatize_tokens(tokens) -> list:
    """Lemmatize each token with nltk's ``WordNetLemmatizer``.

    Calls ``nltk.download('wordnet')`` on every invocation before
    lemmatizing.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A list with the lemmatized form of every token, in order.
    """
    import nltk
    from nltk.stem import WordNetLemmatizer

    nltk.download("wordnet")
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens]


def flexbile_wordcloud_function(
    text: str,
    output_filepath: str,
    mask_path=None,
    white_mask_background=True,
    width=725,
    height=300,
    background_color="white",
    colormap="viridis",
    contour_color="steelblue",
    contour_width=3,
    collocations=False,
    max_words=2000,
    max_font_size=40,
    min_font_size=4,
    prefer_horizontal=0.9,
    include_numbers=True,
):
    """Build a word cloud from *text*, save it, and return it as an image.

    The text is tokenized with ``tokenize_text`` and lemmatized with
    ``lemmatize_tokens``; the resulting tokens are joined with commas and
    passed to ``WordCloud.generate``.  The cloud is drawn on the current
    matplotlib axes, written to *output_filepath*, and the elapsed time is
    printed.

    Args:
        text: Raw input text.
        output_filepath: Destination path passed to ``WordCloud.to_file``.
        mask_path: Optional path to a mask image.  When given, the cloud's
            width and height are taken from the mask and ``width``/``height``
            are ignored.
        white_mask_background: Set to ``False`` when the mask has a black
            background; zero-valued mask entries are then set to 255 before
            the mask is used.
        width, height: Canvas size, used only when no mask is supplied.
        contour_color, contour_width: Passed to ``WordCloud`` only when a
            mask is supplied.
        background_color, colormap, collocations, max_words, max_font_size,
        min_font_size, prefer_horizontal, include_numbers: Passed straight
            through to ``WordCloud``.

    Returns:
        The value of ``WordCloud.to_image()`` for the generated cloud.
    """
    start = time.perf_counter()

    lemmatized_tokens = lemmatize_tokens(tokens=tokenize_text(text=text))

    from wordcloud import WordCloud
    from PIL import Image
    import matplotlib.pyplot as plt
    import numpy as np

    # Options shared by the masked and unmasked variants.
    common_options = dict(
        background_color=background_color,
        colormap=colormap,
        collocations=collocations,
        max_words=max_words,
        max_font_size=max_font_size,
        min_font_size=min_font_size,
        prefer_horizontal=prefer_horizontal,
        include_numbers=include_numbers,
    )

    if mask_path is None:
        wordcloud = WordCloud(width=width, height=height, **common_options)
    else:
        # Close the image file as soon as its pixels are copied into an array.
        with Image.open(mask_path) as mask_image:
            mask = np.array(mask_image)

        # Turn a black (0) background into white (255).
        if not white_mask_background:
            mask[mask == 0] = 255

        wordcloud = WordCloud(
            mask=mask,
            width=mask.shape[1],
            height=mask.shape[0],
            contour_color=contour_color,
            contour_width=contour_width,
            **common_options,
        )

    # generate() expects a single string, so the tokens are joined first.
    wordcloud.generate(",".join(lemmatized_tokens))

    elapsed = time.perf_counter() - start
    print(f"wordcloud created in {round(elapsed, 1)} seconds")

    # Show, save, and return the word cloud.
    plt.imshow(wordcloud)
    wordcloud.to_file(output_filepath)
    return wordcloud.to_image()
Preview:
Download PNG
Download JPEG
Download SVG
Tip: You can change the style, width & colours of the snippet with the inspect tool before clicking Download!
Click to optimize width for Twitter