# Distinct label values present in the training data.
type_of_label = set(data_train['label'])
# NOTE(review): `type_of_label` is not used below, while the loops iterate
# `type_of_companies`, which is not defined in this section -- confirm it is
# defined earlier and is the intended collection.

# NOTE(review): `stop` is read below, but its only visible definition is the
# commented-out setup that follows; confirm it is defined earlier.
# stop = stopwords.words('english')
# stop.append("the")
# stop.append("company")
# stop_words=set(stop)

# Two-way mapping between integer positions and labels:
#   label_company: position -> label
#   label_other:   label -> position
label_company = dict(enumerate(type_of_companies))
label_other = {s: position for position, s in label_company.items()}
index = len(label_company)  # same final value the manual counter used to hold

# Loop invariants: build the tokenizer and the stop-word lookup once.
tokenizer = RegexpTokenizer(r'\w+')
stop_lookup = set(stop)  # set membership instead of scanning `stop` per token

# For each label, plot the 10 most frequent non-stop-word tokens found in the
# "text" column of the rows carrying that label.
for s in type_of_companies:
    df = data_train[data_train['label'] == s]

    # Join with a space so the last word of one text is not fused with the
    # first word of the next one (which would create bogus tokens).
    email = ' '.join(df["text"])

    word_tokens = tokenizer.tokenize(email)
    lowered_tokens = (w.lower() for w in word_tokens)
    filtered_sentence = [w for w in lowered_tokens if w not in stop_lookup]

    fdist2 = FreqDist(filtered_sentence)
    fdist2.plot(10, cumulative=False, title='Frequency for ' + str(s))
Preview:
Download PNG
Download JPEG
Download SVG
Tip: You can change the style, width & colours of the snippet with the inspect tool before clicking Download!
Click to optimize width for Twitter