Most common words for each sector, visualized


Sun Aug 22 2021 18:21:27 GMT+0000 (Coordinated Universal Time)

Saved by @QuinnFox12 #chinese #visualization

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.probability import FreqDist

# Sectors of activity present in the training data
type_of_companies = set(data_train['label'])

# Stop words to filter out of the descriptions
# (requires the NLTK stopwords corpus: nltk.download('stopwords'))
stop = stopwords.words('english')
stop.append("the")
stop.append("company")
stop_words = set(stop)

# Map each sector to a numeric index, and back
label_company = dict()
label_other = dict()
for index, s in enumerate(type_of_companies):
    label_company[index] = s
    label_other[s] = index


# For each sector: concatenate all descriptions, tokenize, drop stop words,
# then plot the 10 most frequent words
for s in type_of_companies:
    df = data_train[data_train['label'] == s]
    email = ''
    for i in df.index:
        email += df["text"][i]
    tokenizer = RegexpTokenizer(r'\w+')
    word_tokens = tokenizer.tokenize(email)
    filtered_sentence = []
    for w in word_tokens:
        if w.lower() not in stop_words:
            filtered_sentence.append(w.lower())

    fdist2 = FreqDist(filtered_sentence)
    fdist2.plot(10, cumulative=False, title='Frequency for ' + str(s))

Each type of company is assigned a number and stored in a dictionary; this numeric encoding will be useful later for learning. Next, we want to see whether each sector of activity has words that appear frequently in its descriptions. We therefore count the most common words per sector and plot them as a frequency curve. We focus on the 10 most frequent words each time to have a sufficient sample, although this choice is arbitrary.
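
As a small follow-up, the sector-to-index dictionary can be applied to the label column so the data is ready for a classifier. This is a minimal sketch, assuming data_train is a pandas DataFrame with 'label' and 'text' columns and that label_other is the dictionary built above; the 'label_id' column name is just an illustrative choice.

# Minimal sketch: encode each sector label as its numeric index for learning.
# Assumes label_other maps sector -> index (built above);
# the column name 'label_id' is hypothetical.
data_train['label_id'] = data_train['label'].map(label_other)

# Quick sanity check: each sector should map to exactly one integer id
print(data_train[['label', 'label_id']].drop_duplicates().sort_values('label_id'))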