# Plot the ten most frequent non-stop-word tokens for every label in the
# training data, one chart per label.
#
# NOTE(review): `type_of_companies` and `stop` are not defined in this snippet.
# `type_of_companies` is presumably the same collection as `type_of_label`
# below, and the previously commented-out setup built `stop` from NLTK's
# English stopwords plus "the" and "company" -- confirm both are defined
# earlier in the file/notebook.

type_of_label = set(data_train['label'])

# Two-way mapping between a running integer index and each label.
label_company = dict(enumerate(type_of_companies))
label_other = {label: idx for idx, label in label_company.items()}

# Loop-invariant helpers: build the tokenizer once and use a set so each
# stop-word membership test is O(1).
tokenizer = RegexpTokenizer(r'\w+')
stop_lookup = set(stop)

for s in type_of_companies:
    df = data_train[data_train['label'] == s]

    # Join with a space so the last word of one e-mail cannot fuse with the
    # first word of the next one (plain concatenation produced bogus tokens).
    # Missing texts are skipped instead of raising TypeError on NaN.
    email = ' '.join(df['text'].dropna().astype(str))

    word_tokens = tokenizer.tokenize(email)
    lowered = (w.lower() for w in word_tokens)
    filtered_sentence = [w for w in lowered if w not in stop_lookup]

    fdist2 = FreqDist(filtered_sentence)
    fdist2.plot(10, cumulative=False, title=f'Frequency for {s}')
Preview:
Download PNG
Download JPEG
Download SVG
Tip: You can change the style, width & colours of the snippet with the inspect tool before clicking Download!
Click to optimize width for Twitter