Snippets Collections
# Imports
from collections import namedtuple
from itertools import groupby
from pathlib import Path

import cv2
import matplotlib.pyplot as plt
import numpy as np
from openvino.runtime import Core

# Settings
# Directories where data will be placed
model_folder = "model"
data_folder = "data"
charlist_folder = f"{data_folder}/charlists"

# Precision used by model
precision = "FP16"

Language = namedtuple(
    typename="Language", field_names=["model_name", "charlist_name", "demo_image_name"]
)
chinese_files = Language(
    model_name="handwritten-simplified-chinese-recognition-0001",
    charlist_name="chinese_charlist.txt",
    demo_image_name="handwritten_chinese_test.jpg",
)
japanese_files = Language(
    model_name="handwritten-japanese-recognition-0001",
    charlist_name="japanese_charlist.txt",
    demo_image_name="handwritten_japanese_test.png",
)

# Select Language
# Select language by using either language='chinese' or language='japanese'
language = "chinese"

languages = {"chinese": chinese_files, "japanese": japanese_files}

selected_language = languages.get(language)

# Download Model
path_to_model_weights = Path(f'{model_folder}/intel/{selected_language.model_name}/{precision}/{selected_language.model_name}.bin')
if not path_to_model_weights.is_file():
    download_command = f'omz_downloader --name {selected_language.model_name} --output_dir {model_folder} --precision {precision}'
    print(download_command)
    ! $download_command

# Load Network and Execute
ie = Core()
path_to_model = path_to_model_weights.with_suffix(".xml")
model = ie.read_model(model=path_to_model)

# Select Device Name
# To check available device names run the line below
# print(ie.available_devices)

compiled_model = ie.compile_model(model=model, device_name="CPU")

# Fetch Information About Input and Output Layers
recognition_output_layer = compiled_model.output(0)
recognition_input_layer = compiled_model.input(0)

# Load an Image
# Get the file name of the demo image for the selected language

file_name = selected_language.demo_image_name

# The text recognition model expects an image in grayscale format
# IMPORTANT! This model can read only one line of text at a time

# Read image
image = cv2.imread(filename=f"{data_folder}/{file_name}", flags=cv2.IMREAD_GRAYSCALE)

# Fetch shape
image_height, _ = image.shape

# B,C,H,W = batch size, number of channels, height, width
_, _, H, W = recognition_input_layer.shape

# Calculate scale ratio between input shape height and image height to resize image
scale_ratio = H / image_height

# Resize image to expected input sizes
resized_image = cv2.resize(
    image, None, fx=scale_ratio, fy=scale_ratio, interpolation=cv2.INTER_AREA
)

# Pad image to match input size, without changing aspect ratio
resized_image = np.pad(
    resized_image, ((0, 0), (0, W - resized_image.shape[1])), mode="edge"
)

# Reshape to the network input shape
input_image = resized_image[None, None, :, :]

# Visualise Input Image
plt.figure(figsize=(20, 1))
plt.axis("off")
plt.imshow(resized_image, cmap="gray", vmin=0, vmax=255);

# Prepare Charlist
# Get dictionary to encode output, based on model documentation
used_charlist = selected_language.charlist_name

# For both models, a blank symbol must be added at index 0 of the charlist
blank_char = "~"

with open(f"{charlist_folder}/{used_charlist}", "r", encoding="utf-8") as charlist:
    letters = blank_char + "".join(line.strip() for line in charlist)

# Run Inference
# Run inference on the model
predictions = compiled_model([input_image])[recognition_output_layer]

# Process Output Data
# Remove batch dimension
predictions = np.squeeze(predictions)

# Run argmax to pick the symbols with the highest probability
predictions_indexes = np.argmax(predictions, axis=1)

# Use groupby to collapse repeated consecutive symbols, as required by CTC greedy decoding
output_text_indexes = list(groupby(predictions_indexes))

# Remove grouper objects
output_text_indexes, _ = np.transpose(output_text_indexes, (1, 0))

# Remove blank symbols
output_text_indexes = output_text_indexes[output_text_indexes != 0]

# Assign letters to indexes from output array
output_text = [letters[letter_index] for letter_index in output_text_indexes]
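
# A minimal sketch (toy indices, not actual model output) of why repeats are
# collapsed before blanks are dropped: a genuine double letter survives only
# when a blank separates its two occurrences.
toy_letters = "~helo"                                  # index 0 is the blank
toy_indexes = [1, 1, 0, 2, 0, 3, 3, 0, 3, 4]           # "hh~e~ll~lo"
toy_collapsed = [key for key, _ in groupby(toy_indexes)]
toy_decoded = "".join(toy_letters[i] for i in toy_collapsed if i != 0)  # "hello"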

# Print Output
plt.figure(figsize=(20, 1))
plt.axis("off")
plt.imshow(resized_image, cmap="gray", vmin=0, vmax=255)

print("".join(output_text))
# Print the 6 most representative words for each of the first `num` topics
for idx in range(num):
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 6))
# Select DataFrame rows by index from a list of ints
top_sim = [21, 24622, 32199, 32570, 17463]

top_sim_frame = job_vec.loc[top_sim, :]
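
# A small sketch (toy DataFrame, made up here) of the loc/iloc distinction:
# .loc selects rows by index *label*, .iloc by integer *position*.
import pandas as pd

toy = pd.DataFrame({"a": [10, 20, 30]}, index=[5, 7, 9])
print(toy.loc[[5, 9]])    # rows whose index labels are 5 and 9
print(toy.iloc[[0, 2]])   # first and third rows by position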
# --------------------

You need to add () because & has higher precedence than ==:

df3 = df[(df['count'] == '2') & (df['price'] == '100')]
print (df3)
  id count price
0  1     2   100

If you need to check multiple values, use isin:

df4 = df[(df['count'].isin(['2','7'])) & (df['price'].isin(['100', '221']))]
print (df4)
  id count price
0  1     2   100
3  4     7   221

But if the columns are numeric, use:

df3 = df[(df['count'] == 2) & (df['price'] == 100)]
print (df3)

df4 = df[(df['count'].isin([2,7])) & (df['price'].isin([100, 221]))]
print (df4)

# Convert the Simplified-Chinese stopword list to Traditional with OpenCC
# (imports below are assumed; `stopwords` looks like the stopwordsiso helper)
from opencc import OpenCC
from stopwordsiso import stopwords

stop_words = list(stopwords(["zh"]))
cc = OpenCC('s2t')
stop_word = []
for i in stop_words:
    text = cc.convert(i)
    stop_word.append(text)
print(stop_word)

lista   = ['请问','谢谢您','谢谢你','谢谢','谢','您好','_','喔', '意思', '午', '意', "感",'想','问']
cc = OpenCC('s2t')
stop_wordsf = []
for i in lista:
    text = cc.convert(i)
    stop_wordsf.append(text)
print(stop_wordsf)
# print('Input DataSet Name')
# dataset = input()
# print('Input Number of Classes')
# classes = int(input())
# dataset_path = 'pre_processed_df/' + 'pre_processed_' + dataset + '.csv'

# Clean text, remove stopwords, and segment with jieba
# (imports below are assumed; `stopwords` looks like the stopwordsiso helper)
import re
import jieba
from stopwordsiso import stopwords

def preprocessingTextFull(text, sep=' '):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)  # remove html tags
    text = re.sub(r'<', '', text)  # remove any stray '<'
    text = re.sub(r"[\@\-\;\>\<\:\?\.\!\/_,$%^(\"\']+", ' ', text)  # remove punctuation

    # remove stopwords (note: this filters character by character)
    stop_words = list(stopwords(["zh"]))
    more_s = ['请问', '谢谢您', '谢谢你', '谢谢', '您好', '_']
    stop = stop_words + more_s
    text = "".join([char for char in text if char not in stop])

    for c in ['\r', '\n', '\t']:
        text = re.sub(c, ' ', text)  # replace newlines and tabs with spaces
        text = re.sub(r'\s+', ' ', text)  # collapse multiple spaces into one
    # text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

    # segment with jieba and join the tokens with `sep`
    text_cut = sep.join(jieba.cut(text, cut_all=False))

    return text_cut
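
# Example call (sample sentence made up for illustration); the result should be
# a lower-cased, punctuation-free string of jieba tokens joined by `sep`.
sample = "请问这个产品<b>多少</b>钱?"
print(preprocessingTextFull(sample))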
#________________________________________________________________________________
def clean_text(text, tokenizer, stopwords):
    """Pre-process text and generate tokens

    Args:
        text: Text to tokenize.
        tokenizer: Callable that splits a string into tokens.
        stopwords: Collection of stopwords to drop.

    Returns:
        Tokenized text.
    """
    text = str(text).lower()  # Lowercase words
    text = re.sub(r"\[(.*?)\]", "", text)  # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", "", text)  # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", text
    )  # Remove punctuation

    tokens = tokenizer(text)  # Get tokens from text
    tokens = [t for t in tokens if t not in stopwords]  # Remove stopwords
    tokens = ["" if t.isdigit() else t for t in tokens]  # Remove digits
    tokens = [t for t in tokens if len(t) > 1]  # Remove short tokens
    return tokens
#________________________________________________________________________________
import re
def cleanResume(resumeText):
    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub(r'RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub(r'#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub(r'@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuation
    resumeText = re.sub(r'[^\x00-\x7f]', ' ', resumeText)  # remove non-ASCII characters
    resumeText = re.sub(r'\s+', ' ', resumeText)  # remove extra whitespace
    return resumeText
    
resumeDataSet['cleaned_resume'] = resumeDataSet.Resume.apply(lambda x: cleanResume(x))


# more_s = ['请问', '没', 'kiehls', 'linepaymoney','谢谢您','谢谢你''谢谢','您好', '姓名','元', '电话', '手机', 'line', 'pay', 'money','不能', '一下', '需要','linepay', '今天', '现在', '最近','_','公司','point','没有']
#     text = re.sub(r'[0-9]+', '', text) #remove numbers
#     text = re.sub(r'[^\w\s]', '', text) #remove punctuation
#     text = re.sub('[^\u4e00-\u9fa5]+', ' ', text) # keep only Chinese characters
#   text = re.sub(r'[^\x00-\x7f]', '', text) #remove non-ASCII strings
#    text = re.sub(r"[\@\-\;\>\<\:\?\.\!\/_,$%^(\"\']+", ' ', text) #remove punctuation, keep ****
https://github.com/SysCV/qdtrack/issues/29
https://stackoverflow.com/questions/50954479/using-cuda-with-pytorch
https://stackoverflow.com/questions/43806326/how-to-run-pytorch-on-gpu-by-default?noredirect=1&lq=1
https://colab.research.google.com/drive/1DIQm9rOx2mT1bZETEeVUThxcrP1RKqAn#scrollTo=81sghL-oijxb
# Plot the 10 most frequent words for each label
# (assumes NLTK and a data_train DataFrame with 'label' and 'text' columns)
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

type_of_companies = set(data_train['label'])

stop = stopwords.words('english')
stop.append("the")
stop.append("company")
stop_words = set(stop)

label_company = dict()
label_other = dict()
index = 0
for s in type_of_companies:
    label_company[index] = s
    label_other[s] = index
    index += 1


for s in type_of_companies:
    df = data_train[data_train['label'] == s]
    email = ''
    for i in df.index:
        email += df["text"][i]
    tokenizer = RegexpTokenizer(r'\w+')
    filtered_sentence = []
    word_tokens = tokenizer.tokenize(email)
    for w in word_tokens:
        if w.lower() not in stop:
            filtered_sentence.append(w.lower())

    fdist2 = FreqDist(filtered_sentence)
    fdist2.plot(10, cumulative=False, title='Frequency for ' + str(s))
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['font.serif'] = ['SimHei']
import seaborn as sns
sns.set_style("darkgrid",{"font.sans-serif":['simhei', 'Arial']})
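# Quick check (toy data, made up for illustration) that Chinese labels render
# with the configured font instead of empty boxes.
import matplotlib.pyplot as plt

plt.bar(["问题", "谢谢", "公司"], [3, 5, 2])
plt.title("中文标签测试")
plt.show()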
# Remove punctuation and segment with jieba (stopword removal left commented out)
def punc_jieba(text, sep=' '):
    # stopword = stopwords(["zh"])
    text_punc = re.sub(r"[\s+\>\<\:\?\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()!,❤。~《》:()【】「」?”“;:、【】╮╯▽╰╭★→「」]+",
                       "", text)
    text_cut = sep.join(jieba.cut(text_punc, cut_all=False)).lower()
    # tokens = word_tokenize(text_cut)
    # clean_text = [word for word in tokens if not word in stopword]

    return text_cut
# Method 1: apply a stopword-removal function
def stop_word(text):
    stopword = stopwords(['zh'])
    remove_stw = [word for word in text if word not in stopword]
    return remove_stw
df['text'] = df['text'].apply(stop_word)
# Method 2: filter inside a lambda
stopword = stopwords(['zh'])
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopword]))
sudo pip install opencc
# if this does not work, clone the project first
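# If `pip install opencc` fails, an assumed alternative is the pure-Python
# reimplementation on PyPI, which exposes the same `OpenCC` class:
sudo pip install opencc-python-reimplemented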

import pandas as pd
import numpy as np
# -*- coding: utf-8 -*-
import opencc
from opencc import OpenCC

df = pd.read_csv('training.csv').astype(str)

def tra_sim(text):
    # 'tw2s' converts Traditional Chinese (Taiwan standard) to Simplified
    cc = OpenCC('tw2s')
    sim = cc.convert(text)
    return sim
df['sim_label'] = df['label'].apply(tra_sim)
df['sim_detail_label'] = df['detail_label'].apply(tra_sim)
df['sim_text'] = df['text'].apply(tra_sim)
Snippet sources and tags
Fri Jun 17 2022 | https://github.com/openvinotoolkit/openvino_notebooks/blob/main/notebooks/209-handwritten-ocr/209-handwritten-ocr.ipynb | #python #openvino #openvino-notebooks #deeplearning #accelerated-inference #nlp #ocr #chinese #japanese #handwritten
Wed Oct 13 2021 | #chinese #stopwords #convertchitra
Thu Oct 07 2021 | https://thispointer.com/select-rows-columns-by-name-or-index-in-dataframe-using-loc-iloc-python-pandas/ | #chinese #stopwords #convertchitra
Thu Sep 30 2021 | #chinese #stopwords #convertchitra
Wed Aug 25 2021 | #chinese #visualization
Sun Aug 22 2021 | #chinese #visualization
Sun Aug 22 2021 | #chinese #visualization
Sun Aug 22 2021 | https://github.com/mwaskom/seaborn/issues/1009 | #chinese #visualization
Fri Aug 13 2021 | #python #pandas #column #nlp #chinese #trasim
Thu Aug 12 2021 | #python #pandas #column #nlp #chinese #trasim
