Snippets Collections
# Imports
from collections import namedtuple
from itertools import groupby
from pathlib import Path

import cv2
import matplotlib.pyplot as plt
import numpy as np
from openvino.runtime import Core

# Settings
# Directories where data will be placed
model_folder = "model"
data_folder = "data"
charlist_folder = f"{data_folder}/charlists"

# Precision used by model
precision = "FP16"

Language = namedtuple(
    typename="Language", field_names=["model_name", "charlist_name", "demo_image_name"]
)
chinese_files = Language(
    model_name="handwritten-simplified-chinese-recognition-0001",
    charlist_name="chinese_charlist.txt",
    demo_image_name="handwritten_chinese_test.jpg",
)
japanese_files = Language(
    model_name="handwritten-japanese-recognition-0001",
    charlist_name="japanese_charlist.txt",
    demo_image_name="handwritten_japanese_test.png",
)

# Select Language
# Select language by using either language='chinese' or language='japanese'
language = "chinese"

languages = {"chinese": chinese_files, "japanese": japanese_files}

selected_language = languages.get(language)

# Download Model
path_to_model_weights = Path(f'{model_folder}/intel/{selected_language.model_name}/{precision}/{selected_language.model_name}.bin')
if not path_to_model_weights.is_file():
    download_command = f'omz_downloader --name {selected_language.model_name} --output_dir {model_folder} --precision {precision}'
    print(download_command)
    ! $download_command

# Load Network and Execute
ie = Core()
path_to_model = path_to_model_weights.with_suffix(".xml")
model = ie.read_model(model=path_to_model)

# Select Device Name
# To check available device names run the line below
# print(ie.available_devices)

compiled_model = ie.compile_model(model=model, device_name="CPU")

# Fetch Information About Input and Output Layers
recognition_output_layer = compiled_model.output(0)
recognition_input_layer = compiled_model.input(0)

# Load an Image
# Get the file name of the demo image for the selected language

file_name = selected_language.demo_image_name

# The text recognition model expects an image in grayscale format
# IMPORTANT! This model can read only one line of text at a time

# Read image
image = cv2.imread(filename=f"{data_folder}/{file_name}", flags=cv2.IMREAD_GRAYSCALE)

# Fetch shape
image_height, _ = image.shape

# B,C,H,W = batch size, number of channels, height, width
_, _, H, W = recognition_input_layer.shape

# Calculate scale ratio between input shape height and image height to resize image
scale_ratio = H / image_height

# Resize image to expected input sizes
resized_image = cv2.resize(
    image, None, fx=scale_ratio, fy=scale_ratio, interpolation=cv2.INTER_AREA
)

# Pad image to match input size, without changing aspect ratio
resized_image = np.pad(
    resized_image, ((0, 0), (0, W - resized_image.shape[1])), mode="edge"
)

# Reshape to the network input shape
input_image = resized_image[None, None, :, :]

# Visualise Input Image
plt.figure(figsize=(20, 1))
plt.axis("off")
plt.imshow(resized_image, cmap="gray", vmin=0, vmax=255);

# Prepare Charlist
# Get dictionary to encode output, based on model documentation
used_charlist = selected_language.charlist_name

# For both models, a blank symbol must be added at index 0 of the charlist
blank_char = "~"

with open(f"{charlist_folder}/{used_charlist}", "r", encoding="utf-8") as charlist:
    letters = blank_char + "".join(line.strip() for line in charlist)

# Run Inference
# Run inference on the model
predictions = compiled_model([input_image])[recognition_output_layer]

# Process Output Data
# Remove batch dimension
predictions = np.squeeze(predictions)

# Run argmax to pick the symbols with the highest probability
predictions_indexes = np.argmax(predictions, axis=1)

# Use groupby to collapse repeated consecutive symbols, as required by CTC greedy decoding
output_text_indexes = list(groupby(predictions_indexes))

# Remove grouper objects
output_text_indexes, _ = np.transpose(output_text_indexes, (1, 0))

# Remove blank symbols
output_text_indexes = output_text_indexes[output_text_indexes != 0]

# Assign letters to indexes from output array
output_text = [letters[letter_index] for letter_index in output_text_indexes]
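
# A minimal sketch (toy indices, not actual model output) of why repeats are
# collapsed before blanks are dropped: a genuine double letter survives only
# when a blank separates its two occurrences.
toy_letters = "~helo"                                  # index 0 is the blank
toy_indexes = [1, 1, 0, 2, 0, 3, 3, 0, 3, 4]           # "hh~e~ll~lo"
toy_collapsed = [key for key, _ in groupby(toy_indexes)]
toy_decoded = "".join(toy_letters[i] for i in toy_collapsed if i != 0)  # "hello"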

# Print Output
plt.figure(figsize=(20, 1))
plt.axis("off")
plt.imshow(resized_image, cmap="gray", vmin=0, vmax=255)

print("".join(output_text))
# Print the 6 most representative words for each of the first `num` topics
for idx in range(num):
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 6))
# Select DataFrame rows by index from a list of ints
top_sim = [21, 24622, 32199, 32570, 17463]

top_sim_frame = job_vec.loc[top_sim, :]
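
# A small sketch (toy DataFrame, made up here) of the loc/iloc distinction:
# .loc selects rows by index *label*, .iloc by integer *position*.
import pandas as pd

toy = pd.DataFrame({"a": [10, 20, 30]}, index=[5, 7, 9])
print(toy.loc[[5, 9]])    # rows whose index labels are 5 and 9
print(toy.iloc[[0, 2]])   # first and third rows by position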
# --------------------

You need to add () because & has higher precedence than ==:

df3 = df[(df['count'] == '2') & (df['price'] == '100')]
print (df3)
  id count price
0  1     2   100

If you need to check multiple values, use isin:

df4 = df[(df['count'].isin(['2','7'])) & (df['price'].isin(['100', '221']))]
print (df4)
  id count price
0  1     2   100
3  4     7   221

But if the columns are numeric, use:

df3 = df[(df['count'] == 2) & (df['price'] == 100)]
print (df3)

df4 = df[(df['count'].isin([2,7])) & (df['price'].isin([100, 221]))]
print (df4)

# Convert the Simplified-Chinese stopword list to Traditional with OpenCC
# (imports below are assumed; `stopwords` looks like the stopwordsiso helper)
from opencc import OpenCC
from stopwordsiso import stopwords

stop_words = list(stopwords(["zh"]))
cc = OpenCC('s2t')
stop_word = []
for i in stop_words:
    text = cc.convert(i)
    stop_word.append(text)
print(stop_word)

lista   = ['请问','谢谢您','谢谢你','谢谢','谢','您好','_','喔', '意思', '午', '意', "感",'想','问']
cc = OpenCC('s2t')
stop_wordsf = []
for i in lista:
    text = cc.convert(i)
    stop_wordsf.append(text)
print(stop_wordsf)
# print('Input DataSet Name')
# dataset = input()
# print('Input Number of Classes')
# classes = int(input())
# dataset_path = 'pre_processed_df/' + 'pre_processed_' + dataset + '.csv'

# Clean text, remove stopwords, and segment with jieba
# (imports below are assumed; `stopwords` looks like the stopwordsiso helper)
import re
import jieba
from stopwordsiso import stopwords

def preprocessingTextFull(text, sep=' '):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)  # remove html tags
    text = re.sub(r'<', '', text)  # remove any stray '<'
    text = re.sub(r"[\@\-\;\>\<\:\?\.\!\/_,$%^(\"\']+", ' ', text)  # remove punctuation

    # remove stopwords (note: this filters character by character)
    stop_words = list(stopwords(["zh"]))
    more_s = ['请问', '谢谢您', '谢谢你', '谢谢', '您好', '_']
    stop = stop_words + more_s
    text = "".join([char for char in text if char not in stop])

    for c in ['\r', '\n', '\t']:
        text = re.sub(c, ' ', text)  # replace newlines and tabs with spaces
        text = re.sub(r'\s+', ' ', text)  # collapse multiple spaces into one
    # text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

    # segment with jieba and join the tokens with `sep`
    text_cut = sep.join(jieba.cut(text, cut_all=False))

    return text_cut
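
# Example call (sample sentence made up for illustration); the result should be
# a lower-cased, punctuation-free string of jieba tokens joined by `sep`.
sample = "请问这个产品<b>多少</b>钱?"
print(preprocessingTextFull(sample))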
#________________________________________________________________________________
def clean_text(text, tokenizer, stopwords):
    """Pre-process text and generate tokens

    Args:
        text: Text to tokenize.
        tokenizer: Callable that splits a string into tokens.
        stopwords: Collection of stopwords to drop.

    Returns:
        Tokenized text.
    """
    text = str(text).lower()  # Lowercase words
    text = re.sub(r"\[(.*?)\]", "", text)  # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", "", text)  # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", text
    )  # Remove punctuation

    tokens = tokenizer(text)  # Get tokens from text
    tokens = [t for t in tokens if t not in stopwords]  # Remove stopwords
    tokens = ["" if t.isdigit() else t for t in tokens]  # Remove digits
    tokens = [t for t in tokens if len(t) > 1]  # Remove short tokens
    return tokens
#________________________________________________________________________________
import re
def cleanResume(resumeText):
    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub(r'RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub(r'#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub(r'@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuation
    resumeText = re.sub(r'[^\x00-\x7f]', ' ', resumeText)  # remove non-ASCII characters
    resumeText = re.sub(r'\s+', ' ', resumeText)  # remove extra whitespace
    return resumeText
    
resumeDataSet['cleaned_resume'] = resumeDataSet.Resume.apply(lambda x: cleanResume(x))


# more_s = ['请问', '没', 'kiehls', 'linepaymoney','谢谢您','谢谢你''谢谢','您好', '姓名','元', '电话', '手机', 'line', 'pay', 'money','不能', '一下', '需要','linepay', '今天', '现在', '最近','_','公司','point','没有']
#     text = re.sub(r'[0-9]+', '', text) #remove numbers
#     text = re.sub(r'[^\w\s]', '', text) #remove punctuation
#     text = re.sub('[^\u4e00-\u9fa5]+', ' ', text) # keep only Chinese characters
#   text = re.sub(r'[^\x00-\x7f]', '', text) #remove non-ASCII strings
#    text = re.sub(r"[\@\-\;\>\<\:\?\.\!\/_,$%^(\"\']+", ' ', text) #remove punctuation, keep ****
https://github.com/SysCV/qdtrack/issues/29
https://stackoverflow.com/questions/50954479/using-cuda-with-pytorch
https://stackoverflow.com/questions/43806326/how-to-run-pytorch-on-gpu-by-default?noredirect=1&lq=1
https://colab.research.google.com/drive/1DIQm9rOx2mT1bZETEeVUThxcrP1RKqAn#scrollTo=81sghL-oijxb
# Plot the 10 most frequent words for each label
# (assumes NLTK and a data_train DataFrame with 'label' and 'text' columns)
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

type_of_companies = set(data_train['label'])

stop = stopwords.words('english')
stop.append("the")
stop.append("company")
stop_words = set(stop)

label_company = dict()
label_other = dict()
index = 0
for s in type_of_companies:
    label_company[index] = s
    label_other[s] = index
    index += 1


for s in type_of_companies:
    df = data_train[data_train['label'] == s]
    email = ''
    for i in df.index:
        email += df["text"][i]
    tokenizer = RegexpTokenizer(r'\w+')
    filtered_sentence = []
    word_tokens = tokenizer.tokenize(email)
    for w in word_tokens:
        if w.lower() not in stop:
            filtered_sentence.append(w.lower())

    fdist2 = FreqDist(filtered_sentence)
    fdist2.plot(10, cumulative=False, title='Frequency for ' + str(s))
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['font.serif'] = ['SimHei']
import seaborn as sns
sns.set_style("darkgrid",{"font.sans-serif":['simhei', 'Arial']})
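# Quick check (toy data, made up for illustration) that Chinese labels render
# with the configured font instead of empty boxes.
import matplotlib.pyplot as plt

plt.bar(["问题", "谢谢", "公司"], [3, 5, 2])
plt.title("中文标签测试")
plt.show()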
# Remove punctuation and segment with jieba (stopword removal left commented out)
def punc_jieba(text, sep=' '):
    # stopword = stopwords(["zh"])
    text_punc = re.sub(r"[\s+\>\<\:\?\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()!,❤。~《》:()【】「」?”“;:、【】╮╯▽╰╭★→「」]+",
                       "", text)
    text_cut = sep.join(jieba.cut(text_punc, cut_all=False)).lower()
    # tokens = word_tokenize(text_cut)
    # clean_text = [word for word in tokens if not word in stopword]

    return text_cut
# Method 1: apply a stopword-removal function
def stop_word(text):
    stopword = stopwords(['zh'])
    remove_stw = [word for word in text if word not in stopword]
    return remove_stw
df['text'] = df['text'].apply(stop_word)
# Method 2: filter inside a lambda
stopword = stopwords(['zh'])
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopword]))
sudo pip install opencc
# if this does not work, clone the project first
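# If `pip install opencc` fails, an assumed alternative is the pure-Python
# reimplementation on PyPI, which exposes the same `OpenCC` class:
sudo pip install opencc-python-reimplemented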

import pandas as pd
import numpy as np
# -*- coding: utf-8 -*-
import opencc
from opencc import OpenCC

df = pd.read_csv('training.csv').astype(str)

def tra_sim(text):
    # 'tw2s' converts Traditional Chinese (Taiwan standard) to Simplified
    cc = OpenCC('tw2s')
    sim = cc.convert(text)
    return sim
df['sim_label'] = df['label'].apply(tra_sim)
df['sim_detail_label'] = df['detail_label'].apply(tra_sim)
df['sim_text'] = df['text'].apply(tra_sim)
Snippet sources and tags
Fri Jun 17 2022 | https://github.com/openvinotoolkit/openvino_notebooks/blob/main/notebooks/209-handwritten-ocr/209-handwritten-ocr.ipynb | #python #openvino #openvino-notebooks #deeplearning #accelerated-inference #nlp #ocr #chinese #japanese #handwritten
Wed Oct 13 2021 | #chinese #stopwords #convertchitra
Thu Oct 07 2021 | https://thispointer.com/select-rows-columns-by-name-or-index-in-dataframe-using-loc-iloc-python-pandas/ | #chinese #stopwords #convertchitra
Thu Sep 30 2021 | #chinese #stopwords #convertchitra
Wed Aug 25 2021 | #chinese #visualization
Sun Aug 22 2021 | #chinese #visualization
Sun Aug 22 2021 | #chinese #visualization
Sun Aug 22 2021 | https://github.com/mwaskom/seaborn/issues/1009 | #chinese #visualization
Fri Aug 13 2021 | #python #pandas #column #nlp #chinese #trasim
Thu Aug 12 2021 | #python #pandas #column #nlp #chinese #trasim
