preprocessing Text Full and path

PHOTO

Wed Aug 25 2021 17:43:54 GMT+0000 (Coordinated Universal Time)

Saved by @QuinnFox12 #chinese #visualization

# print('Input DataSet Name')
# dataset = input()
# print('Input Number of Classes')
# classes = int(input())
# dataset_path = 'pre_processed_df/' + 'pre_processed_' + dataset + '.csv'

# clean text and seg
def preprocessingTextFull(text, sep = ' '):
    text = text.lower()
    text = re.sub(r'&lt;', '', text) #remove '&lt;' tag
    text = re.sub(r'<.*?>', '', text) #remove html
    text = re.sub("[\@\-\;\>\<\:\?\.\!\/_,$%^(\"\']+" , ' ' , text) #remove punctiation
    # remove stopword
    stop_words = list(stopwords(["zh"]))
    more_s = ['请问','谢谢您','谢谢你''谢谢','您好','_']
    stop = stop_words + more_s
    text = "".join([word for word in text if word not in stop]) #remove stopwords
    
    for c in ['\r', '\n', '\t'] :
        text = re.sub(c, ' ', text) #replace newline and tab with tabs\
        text = re.sub('\s+', ' ', text) #replace multiple spaces with one space
#         text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    text_cut = sep.join(jieba.cut(text, cut_all=False))
        
    return text_cut
#________________________________________________________________________________
def clean_text(text, tokenizer, stopwords):
    """Pre-process text and generate tokens

    Args:
        text: Text to tokenize.

    Returns:
        Tokenized text.
    """
    text = str(text).lower()  # Lowercase words
    text = re.sub(r"\[(.*?)\]", "", text)  # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", "", text)  # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", text
    )  # Remove punctuation

    tokens = tokenizer(text)  # Get tokens from text
    tokens = [t for t in tokens if not t in stopwords]  # Remove stopwords
    tokens = ["" if t.isdigit() else t for t in tokens]  # Remove digits
    tokens = [t for t in tokens if len(t) > 1]  # Remove short tokens
    return tokens
#________________________________________________________________________________
import re
def cleanResume(resumeText):
    resumeText = re.sub('http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub('RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub('#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub('@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]',r' ', resumeText) 
    resumeText = re.sub('\s+', ' ', resumeText)  # remove extra whitespace
    return resumeText
    
resumeDataSet['cleaned_resume'] = resumeDataSet.Resume.apply(lambda x: cleanResume(x))


# more_s = ['请问', '没', 'kiehls', 'linepaymoney','谢谢您','谢谢你''谢谢','您好', '姓名','元', '电话', '手机', 'line', 'pay', 'money','不能', '一下', '需要','linepay', '今天', '现在', '最近','_','公司','point','没有']
#     text = re.sub(r'[0-9]+', '', text) #remove number
#     text = re.sub(r'[^\w\s]', '', text) #remove punctiation
#     text = re.sub('[^\u4e00-\u9fa5]+', ' ', text) # remove ASCII strings
#   text = re.sub(r'[^\x00-\x7f]', '', text) #remove non ASCII strings
#    text = re.sub("[\@\-\;\>\<\:\?\.\!\/_,$%^(\"\']+" , ' ' , text) #remove punctiation, keep ****

COPY

Save snippets that work from anywhere online with our extensions

Comments

text-preprocessing

@QuinnFox12

.partition() (picking up piece of string between separators) findall and search text between 2 strings apply function and def Python regex related snippets convert tra to sim chinese remove punc and stopword chinese most common words for each sector and visualize preprocessing Text Full and path convert dataframe to txt, to list Working with null nan nul preprocessing Text Full and path colab common useful snippets multi txt to pandas convert stopword list from sim to tra n-gram with filter POS tag Retrain spacy tagger pos working with time series snippets Extract Text From PDF with Python time in pandas dataframe python vlookup dataframe remove blank string in list check if 2 strings in string

pandas

@QuinnFox12

Combine columns in dataframe create and save dataframe to csv replace function in dataframe apply function and def convert tra to sim chinese create and save dataframe to csv remove punc and stopword chinese preprocessing Text Full and path convert dataframe to txt, to list Working with null nan nul preprocessing Text Full and path multi txt to pandas Pandas selection iloc loc note working with time series snippets group by the first column and get second column as lists in rows: Remove blank cell in pandas dataframe time in pandas dataframe python vlookup dataframe select pandas column as dataframe instead of series dataframe all related Concate dataframe in for loop

NLP

@QuinnFox12

convert tra to sim chinese remove punc and stopword chinese Chinese POS most common words for each sector and visualize preprocessing Text Full and path convert dataframe to txt, to list preprocessing Text Full and path colab common useful snippets multi txt to pandas convert stopword list from sim to tra Pandas selection iloc loc note n-gram with filter POS tag Retrain spacy tagger pos Train Fasttext - GloVe algorithm on my own corpus Extract Text From PDF with Python tạo wordcloud

regex

@QuinnFox12

Python regex related snippets preprocessing Text Full and path preprocessing Text Full and path check if 2 strings in string

chinese

@QuinnFox12

remove punc and stopword chinese preprocessing Text Full and path preprocessing Text Full and path convert stopword list from sim to tra

#python #pandas #column #nlp #chinese #trasim

convert tra to sim chinese

sudo pip install opencc
# if nt work, should clone project first

import pandas as pd
import numpy as np
# -*- coding: utf-8 -*-
import opencc
from opencc import OpenCC

df = pd.read_csv('training.csv').astype(str)

def tra_sim(text):
    cc = OpenCC('tw2s')
    sim = cc.convert(text)
    return sim
df['sim_label'] = df['label'].apply(tra_sim)
df['sim_detail_label'] = df['detail_label'].apply(tra_sim)
df['sim_text'] = df['text'].apply(tra_sim)

#python #pandas #column #nlp #chinese #trasim

remove punc and stopword chinese

# remove punc, segment and stopword
def punc_jieba(text, sep = ' '):
#     stopword = stopwords(["zh"])
    text_punc = re.sub("[\s+\>\<\:\?\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）！，❤。～《》：（）【】「」？”“；：、【】╮╯▽╰╭★→「」]+".encode().decode("utf8"),
                        "",text)
    text_cut = sep.join(jieba.cut(text_punc, cut_all=False)).lower()
#     tokens = word_tokenize(text_cut)
#     clean_text = [word for word in tokens if not word in stopword]
    
    return text_cut
# mothod1
def stop_word(text):
    stopword = stopwords(['zh'])
    remove_stw = [word for word in text if not word in stopword]
    return remove_stw
df['text'] = df['text'].apply(stop_word)
# mothod2
stopword = stopwords(['zh'])
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopword)]))

#chinese #visualization

display Chinese in seaborn plot

import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['font.serif'] = ['SimHei']
import seaborn as sns
sns.set_style("darkgrid",{"font.sans-serif":['simhei', 'Arial']})

#chinese #visualization

most common words for each sector and visualize

type_of_label = set(data_train['label'])
# stop = stopwords.words('english')
# stop.append("the")
# stop.append("company")
# stop_words=set(stop)
label_company = dict()
label_other = dict()
index = 0
for s in type_of_companies:
    label_company[index]=s
    label_other[s]=index
    index+=1


for s in type_of_companies:
    df=data_train[data_train['label'] == s]
    email=''
    for i in df.index: 
        email+=df["text"][i]
    tokenizer = RegexpTokenizer(r'\w+')
    filtered_sentence=[]
    word_tokens = tokenizer.tokenize(email)
    for w in word_tokens:
        if w.lower() not in stop:
            filtered_sentence.append(w.lower())

    fdist2 = FreqDist(filtered_sentence)
    fdist2.plot(10,cumulative=False,title='Frequency for '+str(s))

#chinese #visualization

found at least two devices, cuda:0 and cpu - pytorch

https://github.com/SysCV/qdtrack/issues/29
https://stackoverflow.com/questions/50954479/using-cuda-with-pytorch
https://stackoverflow.com/questions/43806326/how-to-run-pytorch-on-gpu-by-default?noredirect=1&lq=1
https://colab.research.google.com/drive/1DIQm9rOx2mT1bZETEeVUThxcrP1RKqAn#scrollTo=81sghL-oijxb

#chinese #visualization

preprocessing Text Full and path

# print('Input DataSet Name')
# dataset = input()
# print('Input Number of Classes')
# classes = int(input())
# dataset_path = 'pre_processed_df/' + 'pre_processed_' + dataset + '.csv'

# clean text and seg
def preprocessingTextFull(text, sep = ' '):
    text = text.lower()
    text = re.sub(r'&lt;', '', text) #remove '&lt;' tag
    text = re.sub(r'<.*?>', '', text) #remove html
    text = re.sub("[\@\-\;\>\<\:\?\.\!\/_,$%^(\"\']+" , ' ' , text) #remove punctiation
    # remove stopword
    stop_words = list(stopwords(["zh"]))
    more_s = ['请问','谢谢您','谢谢你''谢谢','您好','_']
    stop = stop_words + more_s
    text = "".join([word for word in text if word not in stop]) #remove stopwords
    
    for c in ['\r', '\n', '\t'] :
        text = re.sub(c, ' ', text) #replace newline and tab with tabs\
        text = re.sub('\s+', ' ', text) #replace multiple spaces with one space
#         text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    text_cut = sep.join(jieba.cut(text, cut_all=False))
        
    return text_cut
#________________________________________________________________________________
def clean_text(text, tokenizer, stopwords):
    """Pre-process text and generate tokens

    Args:
        text: Text to tokenize.

    Returns:
        Tokenized text.
    """
    text = str(text).lower()  # Lowercase words
    text = re.sub(r"\[(.*?)\]", "", text)  # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", "", text)  # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", text
    )  # Remove punctuation

    tokens = tokenizer(text)  # Get tokens from text
    tokens = [t for t in tokens if not t in stopwords]  # Remove stopwords
    tokens = ["" if t.isdigit() else t for t in tokens]  # Remove digits
    tokens = [t for t in tokens if len(t) > 1]  # Remove short tokens
    return tokens
#________________________________________________________________________________
import re
def cleanResume(resumeText):
    resumeText = re.sub('http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub('RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub('#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub('@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]',r' ', resumeText) 
    resumeText = re.sub('\s+', ' ', resumeText)  # remove extra whitespace
    return resumeText
    
resumeDataSet['cleaned_resume'] = resumeDataSet.Resume.apply(lambda x: cleanResume(x))


# more_s = ['请问', '没', 'kiehls', 'linepaymoney','谢谢您','谢谢你''谢谢','您好', '姓名','元', '电话', '手机', 'line', 'pay', 'money','不能', '一下', '需要','linepay', '今天', '现在', '最近','_','公司','point','没有']
#     text = re.sub(r'[0-9]+', '', text) #remove number
#     text = re.sub(r'[^\w\s]', '', text) #remove punctiation
#     text = re.sub('[^\u4e00-\u9fa5]+', ' ', text) # remove ASCII strings
#   text = re.sub(r'[^\x00-\x7f]', '', text) #remove non ASCII strings
#    text = re.sub("[\@\-\;\>\<\:\?\.\!\/_,$%^(\"\']+" , ' ' , text) #remove punctiation, keep ****

#chinese #stopwords #convertchitra

convert stopword list from sim to tra

stop_words = list(stopwords(["zh"]))
cc = OpenCC('s2t')
stop_word = []
for i in stop_words:
    text = cc.convert(i)
    stop_word.append(text)
print(stop_word)

lista   = ['请问','谢谢您','谢谢你','谢谢','谢','您好','_','喔', '意思', '午', '意', "感",'想','问']
cc = OpenCC('s2t')
stop_wordsf = []
for i in lista:
    text = cc.convert(i)
    stop_wordsf.append(text)
print(stop_wordsf)

#chinese #stopwords #convertchitra

Pandas selection iloc loc note

# selectdataframe from index from list of int
top_sim = [21, 24622, 32199, 32570, 17463]

top_sim_frame = job_vec.loc[top_simi, : ]
# --------------------

You nedd add () because & has higher precedence than ==:

df3 = df[(df['count'] == '2') & (df['price'] == '100')]
print (df3)
  id count price
0  1     2   100

If need check multiple values use isin:

df4 = df[(df['count'].isin(['2','7'])) & (df['price'].isin(['100', '221']))]
print (df4)
  id count price
0  1     2   100
3  4     7   221

But if check numeric, use:

df3 = df[(df['count'] == 2) & (df['price'] == 100)]
print (df3)

df4 = df[(df['count'].isin([2,7])) & (df['price'].isin([100, 221]))]
print (df4)

#chinese #stopwords #convertchitra

Print function

for idx in range(num):
    # Print the first 16 most representative topics
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 6))

#python #openvino #openvino-notebooks #deeplearning #accelerated-inference ##nlp #ocr #chinese #japanese #handwritten

209-handwritten-ocr: Handwritten Chinese and Japanese OCR

# Imports
from collections import namedtuple
from itertools import groupby
from pathlib import Path

import cv2
import matplotlib.pyplot as plt
import numpy as np
from openvino.runtime import Core

# Settings
# Directories where data will be placed
model_folder = "model"
data_folder = "data"
charlist_folder = f"{data_folder}/charlists"

# Precision used by model
precision = "FP16"

Language = namedtuple(
    typename="Language", field_names=["model_name", "charlist_name", "demo_image_name"]
)
chinese_files = Language(
    model_name="handwritten-simplified-chinese-recognition-0001",
    charlist_name="chinese_charlist.txt",
    demo_image_name="handwritten_chinese_test.jpg",
)
japanese_files = Language(
    model_name="handwritten-japanese-recognition-0001",
    charlist_name="japanese_charlist.txt",
    demo_image_name="handwritten_japanese_test.png",
)

# Select Language
# Select language by using either language='chinese' or language='japanese'
language = "chinese"

languages = {"chinese": chinese_files, "japanese": japanese_files}

selected_language = languages.get(language)

# Download Model
path_to_model_weights = Path(f'{model_folder}/intel/{selected_language.model_name}/{precision}/{selected_language.model_name}.bin')
if not path_to_model_weights.is_file():
    download_command = f'omz_downloader --name {selected_language.model_name} --output_dir {model_folder} --precision {precision}'
    print(download_command)
    ! $download_command

# Load Network and Execute
ie = Core()
path_to_model = path_to_model_weights.with_suffix(".xml")
model = ie.read_model(model=path_to_model)

# Select Device Name
# To check available device names run the line below
# print(ie.available_devices)

compiled_model = ie.compile_model(model=model, device_name="CPU")

# Fetch Information About Input and Output Layers
recognition_output_layer = compiled_model.output(0)
recognition_input_layer = compiled_model.input(0)

# Load an Image
# Read file name of demo file based on the selected model

file_name = selected_language.demo_image_name

# Text detection models expects an image in grayscale format
# IMPORTANT!!! This model allows to read only one line at time

# Read image
image = cv2.imread(filename=f"{data_folder}/{file_name}", flags=cv2.IMREAD_GRAYSCALE)

# Fetch shape
image_height, _ = image.shape

# B,C,H,W = batch size, number of channels, height, width
_, _, H, W = recognition_input_layer.shape

# Calculate scale ratio between input shape height and image height to resize image
scale_ratio = H / image_height

# Resize image to expected input sizes
resized_image = cv2.resize(
    image, None, fx=scale_ratio, fy=scale_ratio, interpolation=cv2.INTER_AREA
)

# Pad image to match input size, without changing aspect ratio
resized_image = np.pad(
    resized_image, ((0, 0), (0, W - resized_image.shape[1])), mode="edge"
)

# Reshape to network the input shape
input_image = resized_image[None, None, :, :]

# Visualise Input Image
plt.figure(figsize=(20, 1))
plt.axis("off")
plt.imshow(resized_image, cmap="gray", vmin=0, vmax=255);

# Prepare Charlist
# Get dictionary to encode output, based on model documentation
used_charlist = selected_language.charlist_name

# With both models, there should be blank symbol added at index 0 of each charlist
blank_char = "~"

with open(f"{charlist_folder}/{used_charlist}", "r", encoding="utf-8") as charlist:
    letters = blank_char + "".join(line.strip() for line in charlist)

# Run Inference
# Run inference on the model
predictions = compiled_model([input_image])[recognition_output_layer]

# Process Output Data
# Remove batch dimension
predictions = np.squeeze(predictions)

# Run argmax to pick the symbols with the highest probability
predictions_indexes = np.argmax(predictions, axis=1)

# Use groupby to remove concurrent letters, as required by CTC greedy decoding
output_text_indexes = list(groupby(predictions_indexes))

# Remove grouper objects
output_text_indexes, _ = np.transpose(output_text_indexes, (1, 0))

# Remove blank symbols
output_text_indexes = output_text_indexes[output_text_indexes != 0]

# Assign letters to indexes from output array
output_text = [letters[letter_index] for letter_index in output_text_indexes]

# Print Output
plt.figure(figsize=(20, 1))
plt.axis("off")
plt.imshow(resized_image, cmap="gray", vmin=0, vmax=255)

print("".join(output_text))

#r #ggplot2 #visualization

ggplot2 graph template

ggplot(data = <DATA>) + 
  <GEOM_FUNCTION>(mapping = aes(<MAPPINGS>))

#html #pandas #python #visualization

Display two Dataframes side by side

import numpy as np
import pandas as pd   
from IPython.display import display_html 

df1 = pd.DataFrame(np.arange(12).reshape((3,4)),columns=['A','B','C','D',])
df2 = pd.DataFrame(np.arange(16).reshape((4,4)),columns=['A','B','C','D',])

df1_styler = df1.style.set_table_attributes("style='display:inline'").set_caption('Caption table 1')
df2_styler = df2.style.set_table_attributes("style='display:inline'").set_caption('Caption table 2')

display_html(df1_styler._repr_html_()+df2_styler._repr_html_(), raw=True)

##python #coding #gui #audio #visualization #fft

Visual Symphony Python’s Real-Time Audio Visualization Magic Using Microsoft Copilot’s 🎵🖥️

import pyaudio
import numpy as np
import pyqtgraph as pg

# Initialize PyAudio
pa = pyaudio.PyAudio()

# Set up audio stream
stream = pa.open(
    format=pyaudio.paInt16,
    channels=1,
    rate=44100,
    input=True,
    frames_per_buffer=1024
)

# Set up PyQTGraph
app = pg.mkQApp()
win = pg.GraphicsLayoutWidget()
win.show()
plot = win.addPlot(title='Real-time Audio Waveform')
curve = plot.plot()

# Function to update the plot
def update():
    wf_data = np.frombuffer(stream.read(1024), dtype=np.int16)
    curve.setData(wf_data)

# Start the audio stream
timer = pg.QtCore.QTimer()
timer.timeout.connect(update)
timer.start(50)

# Start Qt event loop
pg.mkQApp().exec()

preprocessing Text Full and path

Save snippets that work from anywhere online with our extensions

Comments

More like this

text-preprocessing

pandas

NLP

regex

chinese

Browse more snippets >>

preprocessing Text Full and path

Save snippets that work from anywhere online with our extensions

Comments

More like this

text-preprocessing

pandas

NLP

regex

chinese

Browse more snippets >>

Embed code snippet