Create a word cloud with NLTK and the wordcloud package


Saved by @QuinnFox12 on March 4, 2023

# tokenize your string with nltk
def tokenize_text(text: str):
    
    # import needed packages
    import nltk
    import re
    
    # remove newline, carriage return, and tab characters from the text
    for char in ["\n", "\r", "\t"]:
        text = text.replace(char, " ")
    
    # lowercase the text
    text = text.lower()
    
    # remove punctuation from text
    text = re.sub(r"[^\w\s]", "", text)
    
    # tokenize the text (needs the nltk "punkt" models; newer nltk releases may also require "punkt_tab")
    nltk.download('punkt', quiet=True)
    tokens = nltk.word_tokenize(text)
    
    # remove english stopwords from the tokens
    nltk.download('stopwords', quiet=True)
    from nltk.corpus import stopwords
    english_stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in english_stop_words]
    
    # return your tokens
    return tokens
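
A quick sanity check of tokenize_text on a made-up sentence (the sample text and expected output are illustrative; the exact result depends on your nltk stopword list):

# example usage of tokenize_text (sample text is illustrative only)
sample = "The Quick Brown Fox\n jumps over the lazy dog!"
print(tokenize_text(sample))
# expected roughly: ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']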
  
# lemmatize your tokens with nltk
def lemmatize_tokens(tokens):
    
    # import needed packages
    import nltk
    nltk.download('wordnet')
    from nltk.stem import WordNetLemmatizer
    
    # initiate the lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    # lemmatize each token (defaults to treating tokens as nouns)
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # return your lemmatized tokens
    return lemmatized_tokens
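
A small check of the lemmatizer on made-up tokens (WordNetLemmatizer defaults to noun lemmas, so verb forms like "running" pass through unchanged):

# example usage of lemmatize_tokens (tokens are illustrative only)
print(lemmatize_tokens(["cats", "geese", "running"]))
# expected: ['cat', 'goose', 'running']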

# create a wordcloud
def flexible_wordcloud_function(text: str,
                                output_filepath: str,
                                mask_path = None,
                                white_mask_background = True,
                                width = 725,
                                height = 300,
                                background_color = "white",
                                colormap = "viridis",
                                contour_color = "steelblue",
                                contour_width = 3,
                                collocations = False,
                                max_words = 2000,
                                max_font_size = 40,
                                min_font_size = 4,
                                prefer_horizontal = 0.9,
                                include_numbers = True):
    
    # start function timer
    import time
    start = time.time()
    
    # tokenize and lemmatize your text
    tokens = tokenize_text(text = text)
    lemmatized_tokens = lemmatize_tokens(tokens = tokens)
    
    # import needed packages
    from wordcloud import WordCloud
    from PIL import Image
    import matplotlib.pyplot as plt
    import numpy as np
    
    # create a wordcloud object without a mask
    if mask_path is None:
    
        # create a WordCloud object
        wordcloud = WordCloud(width = width,
                              height = height,
                              background_color = background_color,
                              colormap = colormap,
                              collocations = collocations,
                              max_words = max_words,
                              max_font_size = max_font_size,
                              min_font_size = min_font_size,
                              prefer_horizontal = prefer_horizontal,
                              include_numbers = include_numbers)
    
    # create a wordcloud object with a mask image
    else:
        
        # open the mask image as a numpy array
        mask = np.array(Image.open(mask_path))
        
        # if your mask has a black background update to white
        if not white_mask_background:
            mask[mask[:, :] == 0] = 255
        
        # create a WordCloud object
        wordcloud = WordCloud(mask = mask,
                              width=mask.shape[1],
                              height=mask.shape[0],
                              background_color = background_color,
                              colormap = colormap,
                              contour_color = contour_color,
                              contour_width = contour_width,
                              collocations = collocations,
                              max_words = max_words,
                              max_font_size = max_font_size,
                              min_font_size = min_font_size,
                              prefer_horizontal = prefer_horizontal,
                              include_numbers = include_numbers)

    # generate a word cloud (must join the tokens into a string)
    wordcloud.generate(','.join(lemmatized_tokens))

    # end wordcloud timer
    end = time.time()
    print(f"wordcloud created in {round(end-start, 1)} seconds")
    
    # display, save, and return the wordcloud
    plt.imshow(wordcloud, interpolation = "bilinear")
    plt.axis("off")
    plt.show()
    wordcloud.to_file(output_filepath)
    return wordcloud.to_image()
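
A minimal call sketch, assuming a local text file and an output image path (both file names are hypothetical placeholders):

# example usage (file paths are hypothetical placeholders)
with open("my_text.txt", "r", encoding = "utf-8") as f:
    raw_text = f.read()

# optionally pass mask_path = "my_mask.png" to shape the cloud with an image
flexible_wordcloud_function(text = raw_text,
                            output_filepath = "my_wordcloud.png")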