thiscodeWorks - Organizing the best of code online

##python #coding #python #programming #nlp

How to make Sentiment Analysis usingTextBlob in Python

from textblob import TextBlob

text = 'how are you'

blob = TextBlob(text)

if blob.polarity > 0:
    print('Positive')
elif blob.polarity < 0:
    print('Negative')
else:
    print('Neutral')

#python #kaggle #nlp #arabicnlp #unicode

text_prepare

def delete_links(input_text):
    pettern  = r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'
    out_text = re.sub(pettern, ' ', input_text)
    return out_text

def delete_repeated_characters(input_text):
    pattern  = r'(.)\1{2,}'
    out_text = re.sub(pattern, r"\1\1", input_text)
    return out_text

def replace_letters(input_text):
    replace = {"أ": "ا","ة": "ه","إ": "ا","آ": "ا","": ""}
    replace = dict((re.escape(k), v) for k, v in replace.items()) 
    pattern = re.compile("|".join(replace.keys()))
    out_text = pattern.sub(lambda m: replace[re.escape(m.group(0))], input_text)
    return out_text

def clean_text(input_text):
    replace = r'[/(){}\[\]|@âÂ,;\?\'\"\*…؟–’،!&\+-:؛-]'
    out_text = re.sub(replace, " ", input_text)
    words = nltk.word_tokenize(out_text)
    words = [word for word in words if word.isalpha()]
    out_text = ' '.join(words)
    return out_text

def remove_vowelization(input_text):
    vowelization = re.compile(""" ّ|َ|ً|ُ|ٌ|ِ|ٍ|ْ|ـ""", re.VERBOSE)
    out_text = re.sub(vowelization, '', input_text)
    return out_text

def delete_stopwords(input_text):
    stop_words = set(nltk.corpus.stopwords.words("arabic") + nltk.corpus.stopwords.words("english"))
    tokenizer = nltk.tokenize.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(input_text)
    wnl = nltk.WordNetLemmatizer()
    lemmatizedTokens =[wnl.lemmatize(t) for t in tokens]
    out_text = [w for w in lemmatizedTokens if not w in stop_words]
    out_text = ' '.join(out_text)
    return out_text

def stem_text(input_text):
    st = ISRIStemmer()
    tokenizer = nltk.tokenize.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(input_text)
    out_text = [st.stem(w) for w in tokens]
    out_text = ' '.join(out_text)
    return out_text


def text_prepare(input_text, ar_text):
    out_text = delete_links(input_text)
    out_text = delete_repeated_characters(out_text)
    out_text = clean_text(out_text)
    out_text = delete_stopwords(out_text)
    if ar_text:
        out_text = replace_letters(out_text)
        out_text = remove_vowelization(out_text)
    else:
        out_text = out_text.lower()
    return out_text

#python #kaggle #nlp #arabicnlp #unicode

remove_arabic_punctuations

ARABIC_PUNCTUATION = ':"؟!؛،,.؍, '

def remove_arabic_punctuations(text : str) -> str:
    '''
        text : ", أَهْلًا وسَهْلًا Hello 212"
        output : 
            ---> "  أَهْلًا وسَهْلًا Hello 212"
    '''
    chars = [char for char in text if (char not in ARABIC_PUNCTUATION)]
    output = ''.join(chars)
    return output

#python #kaggle #nlp #arabicnlp #unicode

remove_diacritics

def remove_diacritics(text : str) -> str:
    '''
        text : "أَهْلًا وسَهْلًا Hello 212"
        output : 
            ---> "أهلا وسهلا Hello 212"
    '''
    chars = [char for char in text if (char not in HARAKAT)]
    output = ''.join(chars)
    return output

#python #spacy #nlp #preprocessing

Utility: Preprocessing text w/ Spacy

### utlity function for pre-processing the text
import spacy

# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm") 

def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

df['preprocessed_txt'] = df['Text'].apply(preprocess)

#python #openvino #openvino-notebooks #deeplearning #accelerated-inference #nlp #entity-recognition #bert

204-named-entity-recognition: Named Entity Recognition with OpenVINO

# Imports
import time
import json

import numpy as np
import tokens_bert as tokens

from openvino.runtime import Core
from openvino.runtime import Dimension

# Download the model
# directory where model will be downloaded
base_model_dir = "model"

# desired precision
precision = "FP16-INT8"

# model name as named in Open Model Zoo
model_name = "bert-small-uncased-whole-word-masking-squad-int8-0002"

model_path = f"model/intel/{model_name}/{precision}/{model_name}.xml"
model_weights_path = f"model/intel/{model_name}/{precision}/{model_name}.bin"

download_command = f"omz_downloader " \
                   f"--name {model_name} " \
                   f"--precision {precision} " \
                   f"--output_dir {base_model_dir} " \
                   f"--cache_dir {base_model_dir}"
! $download_command

# Load the model for Entity Extraction with Dynamic Shape
# initialize inference engine
ie_core = Core()
# read the network and corresponding weights from file
model = ie_core.read_model(model=model_path, weights=model_weights_path)

# assign dynamic shapes to every input layer on the last dimension
for input_layer in model.inputs:
    input_shape = input_layer.partial_shape
    input_shape[1] = Dimension(1, 384)
    model.reshape({input_layer: input_shape})

# compile the model for the CPU
compiled_model = ie_core.compile_model(model=model, device_name="CPU")

# get input names of nodes
input_keys = list(compiled_model.inputs)

# Processing
# path to vocabulary file
vocab_file_path = "data/vocab.txt"

# create dictionary with words and their indices
vocab = tokens.load_vocab_file(vocab_file_path)

# define special tokens
cls_token = vocab["[CLS]"]
sep_token = vocab["[SEP]"]

# set a confidence score threshold
confidence_threshold = 0.4

# Preprocessing
# generator of a sequence of inputs
def prepare_input(entity_tokens, context_tokens):
    input_ids = [cls_token] + entity_tokens + [sep_token] + \
        context_tokens + [sep_token]
    # 1 for any index
    attention_mask = [1] * len(input_ids)
    # 0 for entity tokens, 1 for context part
    token_type_ids = [0] * (len(entity_tokens) + 2) + \
        [1] * (len(context_tokens) + 1)

    # create input to feed the model
    input_dict = {
        "input_ids": np.array([input_ids], dtype=np.int32),
        "attention_mask": np.array([attention_mask], dtype=np.int32),
        "token_type_ids": np.array([token_type_ids], dtype=np.int32),
    }

    # some models require additional position_ids
    if "position_ids" in [i_key.any_name for i_key in input_keys]:
        position_ids = np.arange(len(input_ids))
        input_dict["position_ids"] = np.array([position_ids], dtype=np.int32)

    return input_dict

# Postprocessing
def postprocess(output_start, output_end, entity_tokens,
                context_tokens_start_end, input_size):

    def get_score(logits):
        out = np.exp(logits)
        return out / out.sum(axis=-1)

    # get start-end scores for context
    score_start = get_score(output_start)
    score_end = get_score(output_end)

    # index of first context token in tensor
    context_start_idx = len(entity_tokens) + 2
    # index of last+1 context token in tensor
    context_end_idx = input_size - 1

    # find product of all start-end combinations to find the best one
    max_score, max_start, max_end = find_best_entity_window(
        start_score=score_start, end_score=score_end,
        context_start_idx=context_start_idx, context_end_idx=context_end_idx
    )

    # convert to context text start-end index
    max_start = context_tokens_start_end[max_start][0]
    max_end = context_tokens_start_end[max_end][1]

    return max_score, max_start, max_end


def find_best_entity_window(start_score, end_score,
                            context_start_idx, context_end_idx):
    context_len = context_end_idx - context_start_idx
    score_mat = np.matmul(
        start_score[context_start_idx:context_end_idx].reshape(
            (context_len, 1)),
        end_score[context_start_idx:context_end_idx].reshape(
            (1, context_len)),
    )
    # reset candidates with end before start
    score_mat = np.triu(score_mat)
    # reset long candidates (>16 words)
    score_mat = np.tril(score_mat, 16)
    # find the best start-end pair
    max_s, max_e = divmod(score_mat.flatten().argmax(), score_mat.shape[1])
    max_score = score_mat[max_s, max_e]

    return max_score, max_s, max_e

def get_best_entity(entity, context, vocab):
    # convert context string to tokens
    context_tokens, context_tokens_end = tokens.text_to_tokens(
        text=context.lower(), vocab=vocab)
    # convert entity string to tokens
    entity_tokens, _ = tokens.text_to_tokens(text=entity.lower(), vocab=vocab)

    network_input = prepare_input(entity_tokens, context_tokens)
    input_size = len(context_tokens) + len(entity_tokens) + 3

    # openvino inference
    output_start_key = compiled_model.output("output_s")
    output_end_key = compiled_model.output("output_e")
    result = compiled_model(network_input)

    # postprocess the result getting the score and context range for the answer
    score_start_end = postprocess(output_start=result[output_start_key][0],
                                  output_end=result[output_end_key][0],
                                  entity_tokens=entity_tokens,
                                  context_tokens_start_end=context_tokens_end,
                                  input_size=input_size)

    # return the part of the context, which is already an answer
    return context[score_start_end[1]:score_start_end[2]], score_start_end[0]

# Set the Entity Recognition Template
template = ["building", "company", "persons", "city",
            "state", "height", "floor", "address"]

def run_analyze_entities(context):
    print(f"Context: {context}\n", flush=True)

    if len(context) == 0:
        print("Error: Empty context or outside paragraphs")
        return

    if len(context) > 380:
        print("Error: The context is too long for this particular model. "
              "Try with context shorter than 380 words.")
        return

    # measure processing time
    start_time = time.perf_counter()
    extract = []
    for field in template:
        entity_to_find = field + "?"
        entity, score = get_best_entity(entity=entity_to_find,
                                        context=context,
                                        vocab=vocab)
        if score >= confidence_threshold:
            extract.append({"Entity": entity, "Type": field,
                            "Score": f"{score:.2f}"})
    end_time = time.perf_counter()
    res = {"Extraction": extract, "Time": f"{end_time - start_time:.2f}s"}
    print("\nJSON Output:")
    print(json.dumps(res, sort_keys=False, indent=4))

# Run on Simple Text
# Sample 1
source_text = "Intel Corporation is an American multinational and technology" \
    " company headquartered in Santa Clara, California."
run_analyze_entities(source_text)

# Sample 2
source_text = "Intel was founded in Mountain View, California, " \
    "in 1968 by Gordon E. Moore, a chemist, and Robert Noyce, " \
    "a physicist and co-inventor of the integrated circuit."
run_analyze_entities(source_text)

# Sample 3
source_text = "The Robert Noyce Building in Santa Clara, California, " \
    "is the headquarters for Intel Corporation. It was constructed in 1992 " \
    "and is located at 2200 Mission College Boulevard - 95054. It has an " \
    "estimated height of 22.20 meters and 6 floors above ground."
run_analyze_entities(source_text)

#python #openvino #openvino-notebooks #deeplearning #accelerated-inference #ocr #paddle-paddle #paddle-ocr #nlp

405-paddle-ocr-webcam: PaddleOCR with OpenVINO

# Imports
import sys
import os
import cv2
import numpy as np
import paddle
import math
import time
import collections
from PIL import Image
from pathlib import Path
import tarfile
import urllib.request

from openvino.runtime import Core
from IPython import display
import copy

sys.path.append("../utils")
import notebook_utils as utils
import pre_post_processing as processing

# Models for PaddleOCR
# Define the function to download text detection and recognition models from PaddleOCR resources

def run_model_download(model_url, model_file_path):
    """
    Download pre-trained models from PaddleOCR resources

    Parameters:
        model_url: url link to pre-trained models
        model_file_path: file path to store the downloaded model
    """
    model_name = model_url.split("/")[-1]
    
    if model_file_path.is_file(): 
        print("Model already exists")
    else:
        # Download the model from the server, and untar it.
        print("Downloading the pre-trained model... May take a while...")

        # create a directory
        os.makedirs("model", exist_ok=True)
        urllib.request.urlretrieve(model_url, f"model/{model_name} ")
        print("Model Downloaded")

        file = tarfile.open(f"model/{model_name} ")
        res = file.extractall("model")
        file.close()
        if not res:
            print(f"Model Extracted to {model_file_path}.")
        else:
            print("Error Extracting the model. Please check the network.")

# Download the Model for Text Detection
# Directory where model will be downloaded

det_model_url = "https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar"
det_model_file_path = Path("model/ch_ppocr_mobile_v2.0_det_infer/inference.pdmodel")

run_model_download(det_model_url, det_model_file_path)

# Load the Model for Text Detection
# initialize inference engine for text detection
core = Core()
det_model = core.read_model(model=det_model_file_path)
det_compiled_model = core.compile_model(model=det_model, device_name="CPU")

# get input and output nodes for text detection
det_input_layer = det_compiled_model.input(0)
det_output_layer = det_compiled_model.output(0)

# Download the Model for Text Recognition
rec_model_url = "https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar"
rec_model_file_path = Path("model/ch_ppocr_mobile_v2.0_rec_infer/inference.pdmodel")

run_model_download(rec_model_url, rec_model_file_path)

# Load the Model for Text Recognition with Dynamic Shape
# read the model and corresponding weights from file
rec_model = core.read_model(model=rec_model_file_path)

# assign dynamic shapes to every input layer on the last dimension
for input_layer in rec_model.inputs:
    input_shape = input_layer.partial_shape
    input_shape[3] = -1
    rec_model.reshape({input_layer: input_shape})

rec_compiled_model = core.compile_model(model=rec_model, device_name="CPU")

# get input and output nodes
rec_input_layer = rec_compiled_model.input(0)
rec_output_layer = rec_compiled_model.output(0)

# Preprocessing image functions for text detection and recognition
# Preprocess for text detection
def image_preprocess(input_image, size):
    """
    Preprocess input image for text detection

    Parameters:
        input_image: input image 
        size: value for the image to be resized for text detection model
    """
    img = cv2.resize(input_image, (size, size))
    img = np.transpose(img, [2, 0, 1]) / 255
    img = np.expand_dims(img, 0)
    # NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True}
    img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
    img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
    img -= img_mean
    img /= img_std
    return img.astype(np.float32)

# Preprocess for text recognition
def resize_norm_img(img, max_wh_ratio):
    """
    Resize input image for text recognition

    Parameters:
        img: bounding box image from text detection 
        max_wh_ratio: value for the resizing for text recognition model
    """
    rec_image_shape = [3, 32, 320]
    imgC, imgH, imgW = rec_image_shape
    assert imgC == img.shape[2]
    character_type = "ch"
    if character_type == "ch":
        imgW = int((32 * max_wh_ratio))
    h, w = img.shape[:2]
    ratio = w / float(h)
    if math.ceil(imgH * ratio) > imgW:
        resized_w = imgW
    else:
        resized_w = int(math.ceil(imgH * ratio))
    resized_image = cv2.resize(img, (resized_w, imgH))
    resized_image = resized_image.astype('float32')
    resized_image = resized_image.transpose((2, 0, 1)) / 255
    resized_image -= 0.5
    resized_image /= 0.5
    padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
    padding_im[:, :, 0:resized_w] = resized_image
    return padding_im


def prep_for_rec(dt_boxes, frame):
    """
    Preprocessing of the detected bounding boxes for text recognition

    Parameters:
        dt_boxes: detected bounding boxes from text detection 
        frame: original input frame 
    """
    ori_im = frame.copy()
    img_crop_list = [] 
    for bno in range(len(dt_boxes)):
        tmp_box = copy.deepcopy(dt_boxes[bno])
        img_crop = processing.get_rotate_crop_image(ori_im, tmp_box)
        img_crop_list.append(img_crop)
        
    img_num = len(img_crop_list)
    # Calculate the aspect ratio of all text bars
    width_list = []
    for img in img_crop_list:
        width_list.append(img.shape[1] / float(img.shape[0]))
    
    # Sorting can speed up the recognition process
    indices = np.argsort(np.array(width_list))
    return img_crop_list, img_num, indices


def batch_text_box(img_crop_list, img_num, indices, beg_img_no, batch_num):
    """
    Batch for text recognition

    Parameters:
        img_crop_list: processed detected bounding box images 
        img_num: number of bounding boxes from text detection
        indices: sorting for bounding boxes to speed up text recognition
        beg_img_no: the beginning number of bounding boxes for each batch of text recognition inference
        batch_num: number of images for each batch
    """
    norm_img_batch = []
    max_wh_ratio = 0
    end_img_no = min(img_num, beg_img_no + batch_num)
    for ino in range(beg_img_no, end_img_no):
        h, w = img_crop_list[indices[ino]].shape[0:2]
        wh_ratio = w * 1.0 / h
        max_wh_ratio = max(max_wh_ratio, wh_ratio)
    for ino in range(beg_img_no, end_img_no):
        norm_img = resize_norm_img(img_crop_list[indices[ino]], max_wh_ratio)
        norm_img = norm_img[np.newaxis, :]
        norm_img_batch.append(norm_img)

    norm_img_batch = np.concatenate(norm_img_batch)
    norm_img_batch = norm_img_batch.copy()
    return norm_img_batch

# Postprocessing image for text detection
def post_processing_detection(frame, det_results):
    """
    Postprocess the results from text detection into bounding boxes

    Parameters:
        frame: input image 
        det_results: inference results from text detection model
    """   
    ori_im = frame.copy()
    data = {'image': frame}
    data_resize = processing.DetResizeForTest(data)
    data_list = []
    keep_keys = ['image', 'shape']
    for key in keep_keys:
        data_list.append(data_resize[key])
    img, shape_list = data_list

    shape_list = np.expand_dims(shape_list, axis=0) 
    pred = det_results[0]    
    if isinstance(pred, paddle.Tensor):
        pred = pred.numpy()
    segmentation = pred > 0.3

    boxes_batch = []
    for batch_index in range(pred.shape[0]):
        src_h, src_w, ratio_h, ratio_w = shape_list[batch_index]
        mask = segmentation[batch_index]
        boxes, scores = processing.boxes_from_bitmap(pred[batch_index], mask, src_w, src_h)
        boxes_batch.append({'points': boxes})
    post_result = boxes_batch
    dt_boxes = post_result[0]['points']
    dt_boxes = processing.filter_tag_det_res(dt_boxes, ori_im.shape)    
    return dt_boxes

# Main processing function for PaddleOCR
def run_paddle_ocr(source=0, flip=False, use_popup=False, skip_first_frames=0):
    """
    Main function to run the paddleOCR inference:
    1. Create a video player to play with target fps (utils.VideoPlayer).
    2. Prepare a set of frames for text detection and recognition.
    3. Run AI inference for both text detection and recognition.
    4. Visualize the results.

    Parameters:
        source: the webcam number to feed the video stream with primary webcam set to "0", or the video path.  
        flip: to be used by VideoPlayer function for flipping capture image
        use_popup: False for showing encoded frames over this notebook, True for creating a popup window.
        skip_first_frames: Number of frames to skip at the beginning of the video. 
    """
    # create video player to play with target fps
    player = None
    try:
        player = utils.VideoPlayer(source=source, flip=flip, fps=30, skip_first_frames=skip_first_frames)
        # Start video capturing
        player.start()
        if use_popup:
            title = "Press ESC to Exit"
            cv2.namedWindow(winname=title, flags=cv2.WINDOW_GUI_NORMAL | cv2.WINDOW_AUTOSIZE)

        processing_times = collections.deque()
        while True:
            # grab the frame
            frame = player.next()
            if frame is None:
                print("Source ended")
                break
            # if frame larger than full HD, reduce size to improve the performance
            scale = 1280 / max(frame.shape)
            if scale < 1:
                frame = cv2.resize(src=frame, dsize=None, fx=scale, fy=scale,
                                   interpolation=cv2.INTER_AREA)
            # preprocess image for text detection
            test_image = image_preprocess(frame, 640)
                
            # measure processing time for text detection
            start_time = time.time()
            # perform the inference step
            det_results = det_compiled_model([test_image])[det_output_layer]
            stop_time = time.time()

            # Postprocessing for Paddle Detection
            dt_boxes = post_processing_detection(frame, det_results)

            processing_times.append(stop_time - start_time)
            # use processing times from last 200 frames
            if len(processing_times) > 200:
                processing_times.popleft()
            processing_time_det = np.mean(processing_times) * 1000

            # Preprocess detection results for recognition
            dt_boxes = processing.sorted_boxes(dt_boxes)  
            batch_num = 6
            img_crop_list, img_num, indices = prep_for_rec(dt_boxes, frame)
            
            # For storing recognition results, include two parts:
            # txts are the recognized text results, scores are the recognition confidence level 
            rec_res = [['', 0.0]] * img_num
            txts = [] 
            scores = []

            for beg_img_no in range(0, img_num, batch_num):

                # Recognition starts from here
                norm_img_batch = batch_text_box(
                    img_crop_list, img_num, indices, beg_img_no, batch_num)

                # Run inference for text recognition 
                rec_results = rec_compiled_model([norm_img_batch])[rec_output_layer]

                # Postprocessing recognition results
                postprocess_op = processing.build_post_process(processing.postprocess_params)
                rec_result = postprocess_op(rec_results)
                for rno in range(len(rec_result)):
                    rec_res[indices[beg_img_no + rno]] = rec_result[rno]   
                if rec_res:
                    txts = [rec_res[i][0] for i in range(len(rec_res))] 
                    scores = [rec_res[i][1] for i in range(len(rec_res))]
                                   
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            boxes = dt_boxes
            # draw text recognition results beside the image
            draw_img = processing.draw_ocr_box_txt(
                image,
                boxes,
                txts,
                scores,
                drop_score=0.5)

            # Visualize PaddleOCR results
            f_height, f_width = draw_img.shape[:2]
            fps = 1000 / processing_time_det
            cv2.putText(img=draw_img, text=f"Inference time: {processing_time_det:.1f}ms ({fps:.1f} FPS)", 
                        org=(20, 40),fontFace=cv2.FONT_HERSHEY_COMPLEX, fontScale=f_width / 1000,
                        color=(0, 0, 255), thickness=1, lineType=cv2.LINE_AA)
            
            # use this workaround if there is flickering
            if use_popup: 
                draw_img = cv2.cvtColor(draw_img, cv2.COLOR_RGB2BGR)
                cv2.imshow(winname=title, mat=draw_img)
                key = cv2.waitKey(1)
                # escape = 27
                if key == 27:
                    break
            else:
                # encode numpy array to jpg
                draw_img = cv2.cvtColor(draw_img, cv2.COLOR_RGB2BGR)
                _, encoded_img = cv2.imencode(ext=".jpg", img=draw_img,
                                              params=[cv2.IMWRITE_JPEG_QUALITY, 100])
                # create IPython image
                i = display.Image(data=encoded_img)
                # display the image in this notebook
                display.clear_output(wait=True)
                display.display(i)
            
    # ctrl-c
    except KeyboardInterrupt:
        print("Interrupted")
    # any different error
    except RuntimeError as e:
        print(e)
    finally:
        if player is not None:
            # stop capturing
            player.stop()
        if use_popup:
            cv2.destroyAllWindows()

# Run Live PaddleOCR with OpenVINO
run_paddle_ocr(source=0, flip=False, use_popup=False)

# Test OCR results on video file

video_file = "https://raw.githubusercontent.com/yoyowz/classification/master/images/test.mp4"
run_paddle_ocr(source=video_file, flip=False, use_popup=False, skip_first_frames=0)

#nlp #pos #ngram

working with time series snippets

# https://www.dataquest.io/blog/tutorial-time-series-analysis-with-pandas/
df['TIME'] =  pd.to_datetime(df['Time'],unit='s')
df_time = df.set_index('TIME')
# Add columns with year, month, and Weekday Name
df_time['Year'] = df_time.index.year
df_time['Month'] = df_time.index.month
df_time['Weekday Name'] = df_time.index.weekday_name

# Display a random sampling of 5 rows
df_time.sample(5, random_state=0)

# Visualizing time series data
sns.set(rc={'figure.figsize':(11, 4)})

#javascript #shoelace #css #front #human/data/nlp #nlp #nltk #graycloud

Drawer

<sl-drawer label="Drawer" class="drawer-scrolling">
  <div style="height: 150vh; border: dashed 2px var(--sl-color-neutral-200); padding: 0 1rem;">
    <p>Scroll down and give it a try! 👇</p>
  </div>
  <sl-button slot="footer" variant="primary">Close</sl-button>
</sl-drawer>

<sl-button>Open Drawer</sl-button>

<script>
  const drawer = document.querySelector('.drawer-scrolling');
  const openButton = drawer.nextElementSibling;
  const closeButton = drawer.querySelector('sl-button[variant="primary"]');

  openButton.addEventListener('click', () => drawer.show());
  closeButton.addEventListener('click', () => drawer.hide());
</script>

#nlp #pos #ngram

Retrain spacy tagger pos

from spacy.vocab import Vocab
from spacy.tagger import Tagger
from spacy.tokens import Doc
from spacy.gold import GoldParse
vocab = Vocab(tag_map={'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}})
tagger = Tagger(vocab)
doc = Doc(vocab, words=['I', 'like', 'stuff'])
gold = GoldParse(doc, tags=['N', 'V', 'N'])
tagger.update(doc, gold)
tagger.model.end_training()

#nlp #pos #ngram

n-gram with filter POS tag

# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=12) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

nlp = spacy.load('zh_core_web_md', disable=['parser', 'ner'])

# Define functions for stopwords, bigrams, trigrams and lemmatization

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
python3 -m spacy download en
nlp = spacy.load('zh_core_web_md', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_words_bigrams[:1])


##################################################################
file=open(product_name,'w');
bags=nltk.bigrams(tagged_sentences)
distribution = nltk.FreqDist(bags)
c = Counter(distribution)
for k,count in c.most_common():
  if ((k[0][1])=='JJ')):
    do something...
###########################################################
tokens = []
lemma = []
pos = []

for doc in nlp.pipe(df['species'].astype('unicode').values, batch_size=50,
                        n_threads=3):
    if doc.is_parsed:
        tokens.append([n.text for n in doc])
        lemma.append([n.lemma_ for n in doc])
        pos.append([n.pos_ for n in doc])
    else:
        # We want to make sure that the lists of parsed results have the
        # same number of entries of the original Dataframe, so add some blanks in case the parse fails
        tokens.append(None)
        lemma.append(None)
        pos.append(None)

df['species_tokens'] = tokens
df['species_lemma'] = lemma
df['species_pos'] = pos

#pandas #nlp #colab

colab common useful snippets

#--------------install pytorch geometric
!python -c "import torch; print(torch.version.cuda)"
!python -c "import torch; print(torch.__version__)"
# check above version and edit below accordingly

!pip install torch==1.9.0
!pip uninstall -y torch-scatter
!pip uninstall -y torch-sparse
!pip uninstall -y torch-cluster
!pip uninstall -y torch-geometric
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install torch-geometric

#--------------mount drive-------------------
from google.colab import drive
drive.mount('/content/drive')
### File path
TRAIN_ID_PATH = '/content/drive/MyDrive/folder/pytorch/train.csv'

#pandas #nlp

Working with null nan nul

df.isnull().sum()

#check where is null in a columns
df[df["Business Description"].isnull() == True]

is_NaN = df.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = df[row_has_NaN]

print(rows_with_NaN)

#pandas #nlp

convert dataframe to txt, to list

df['line'].iloc[:2981] = 'train'
df['line'].iloc[2982:] = 'test'

with open('job_post_01.txt', 'a') as f:
    dfAsString = df.to_string(header=False, index=True)
    f.write(dfAsString)

f.close()

data = pd.read_csv('job_post_01.csv')
df = data.sample(frac = 1).reset_index(drop = True)
doc_name_list = df.values.tolist()
doc_train_list = df.iloc[:2981].values.tolist()
doc_test_list = df.iloc[2982:].values.tolist()

#python #pandas #column #nlp #chinese #trasim

remove punc and stopword chinese

# remove punc, segment and stopword
def punc_jieba(text, sep = ' '):
#     stopword = stopwords(["zh"])
    text_punc = re.sub("[\s+\>\<\:\?\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）！，❤。～《》：（）【】「」？”“；：、【】╮╯▽╰╭★→「」]+".encode().decode("utf8"),
                        "",text)
    text_cut = sep.join(jieba.cut(text_punc, cut_all=False)).lower()
#     tokens = word_tokenize(text_cut)
#     clean_text = [word for word in tokens if not word in stopword]
    
    return text_cut
# mothod1
def stop_word(text):
    stopword = stopwords(['zh'])
    remove_stw = [word for word in text if not word in stopword]
    return remove_stw
df['text'] = df['text'].apply(stop_word)
# mothod2
stopword = stopwords(['zh'])
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopword)]))

#python #pandas #column #nlp #chinese #trasim

convert tra to sim chinese

sudo pip install opencc
# if nt work, should clone project first

import pandas as pd
import numpy as np
# -*- coding: utf-8 -*-
import opencc
from opencc import OpenCC

df = pd.read_csv('training.csv').astype(str)

def tra_sim(text):
    cc = OpenCC('tw2s')
    sim = cc.convert(text)
    return sim
df['sim_label'] = df['label'].apply(tra_sim)
df['sim_detail_label'] = df['detail_label'].apply(tra_sim)
df['sim_text'] = df['text'].apply(tra_sim)

#python #huggingface #nlp

How to delete a layer in pretrained model using Huggingface

def deleteEncodingLayers(model, num_layers_to_keep):  # must pass in the full bert model
    oldModuleList = model.bert.encoder.layer
    newModuleList = nn.ModuleList()

    # Now iterate over all layers, only keepign only the relevant layers.
    for i in range(0, len(num_layers_to_keep)):
        newModuleList.append(oldModuleList[i])

    # create a copy of the model, modify it with the new list, and return
    copyOfModel = copy.deepcopy(model)
    copyOfModel.bert.encoder.layer = newModuleList

    return copyOfModel

#textpreprocessing #nlp #function #pandas

apply function and def

rmsval = df.loc[:, 'c1':'c4']
def getrms(row):  
  a = np.sqrt(sum(row**2/4))
  return a
df['rms'] = df.apply(getrms,axis=1)
df.head()

#textpreprocessing #nlp #pandas #function

replace function in dataframe

for c in df_drop.columns:
    df_drop[c] = df_drop[c].str.replace('[^\w\s]+', '')
df_drop = df_drop.astype(str)
df_drop.head()

#textpreprocessing #nlp #gnn #dataload #pytorch

A detailed example of data loaders with PyTorch

# Load entire dataset
X, y = torch.load('some_training_set_with_labels.pt')
 
# Train model
for epoch in range(max_epochs):
    for i in range(n_batches):
        # Local batches and labels
        local_X, local_y = X[i*n_batches:(i+1)*n_batches,], y[i*n_batches:(i+1)*n_batches,]
 
        # Your model
        [...]
         
         
# other
# Unoptimized generator
training_generator = SomeSingleCoreGenerator('some_training_set_with_labels.pt')
 
# Train model
for epoch in range(max_epochs):
    for local_X, local_y in training_generator:
        # Your model
        [...]

#python #textpreprocessing #nlp #dataframe #pandas #save #export

create and save dataframe to csv

import pandas as pd

data = {'Product': ['Desktop Computer','Tablet','Printer','Laptop'],
        'Price': [850,200,150,1300]
        }

df = pd.DataFrame(data, columns= ['Product', 'Price'])

df.to_csv(r'Path where you want to store the exported CSV file\File Name.csv')
# df.to_csv('file_name.csv', encoding='utf-8', index=False)
print (df)

data[['column1','column2','column3',...]].to_csv('fileNameWhereYouwantToWrite.csv')
     
df = pd.DataFrame()
for i in range():
	#....
	df.appen(text)

#python #textpreprocessing #nlp

Combine columns in dataframe

# best way
data['resume'] = data[['Resume_title', 'City', 'State', 'Description', 'work_experiences', 'Educations', 'Skills', 'Certificates', 'Additional Information']].agg(' '.join, axis=1)


# other way
df["period"] = df["Year"] + df["quarter"]
df['Period'] = df['Year'] + ' ' + df['Quarter']
df["period"] = df["Year"].astype(str) + df["quarter"] #If one (or both) of the columns are not string typed
#Beware of NaNs when doing this!
df['period'] = df[['Year', 'quarter', ...]].agg('-'.join, axis=1) #for multiple string columns
df['period'] = df[['Year', 'quarter']].apply(lambda x: ''.join(x), axis=1)
#method cat() of the .str accessor 
df['Period'] = df.Year.str.cat(df.Quarter)
df['Period'] = df.Year.astype(str).str.cat(df.Quarter.astype(str), sep='q')
df['AllTogether'] = df['Country'].str.cat(df[['State', 'City']], sep=' - ') #add parameter na_rep to replace the NaN values with a string if have nan
columns = ['whatever', 'columns', 'you', 'choose']
df['period'] = df[columns].astype(str).sum(axis=1)

#a function
def str_join(df, sep, *cols):
   ...:     from functools import reduce
   ...:     return reduce(lambda x, y: x.astype(str).str.cat(y.astype(str), sep=sep), 
   ...:                   [df[col] for col in cols])
   ...: 

In [4]: df['cat'] = str_join(df, '-', 'c0', 'c1', 'c2', 'c3')

#python #textpreprocessing #nlp

findall and search text between 2 strings

import re

text = 'this is a text'

try:
    found = re.search('is(.+?)text', text).group(1)
except AttributeError:
    # AAA, ZZZ not found in the original string
    found = '0 wtitle' # apply your error handling
found

=> a

# To get more than 1 search
job_title = []
for i in range(0,9282):
    text = data.work_experiences.iloc[i]
    try:
        title = re.findall(r"wtitle (.*?) wcompany",text)
    except :
        title = 'onejob'
    job_title.append(title)
    
data['job_title'] = job_title

#python #textpreprocessing #nlp

.partition() (picking up piece of string between separators)

# picking up piece of string between separators
# function using partition, like partition, but drops the separators
def between(left,right,s):
    before,_,a = s.partition(left)
    a,_,after = a.partition(right)
    return before,a,after
 
s = "bla bla blaa <a>data</a> lsdjfasdjöf (important notice) 'Daniweb forum' tcha tcha tchaa"
print between('<a>','</a>',s)
print between('(',')',s)
print between("'","'",s)
 
""" Output:
('bla bla blaa ', 'data', " lsdjfasdj\xc3\xb6f (important notice) 'Daniweb forum' tcha tcha tchaa")
('bla bla blaa <a>data</a> lsdjfasdj\xc3\xb6f ', 'important notice', " 'Daniweb forum' tcha tcha tchaa")
('bla bla blaa <a>data</a> lsdjfasdj\xc3\xb6f (important notice) ', 'Daniweb forum', ' tcha tcha tchaa')
"""

#textpreprocessing #nlp #py

PyParsing - extract the substring between two markers

import pyparsing as pp

word = pp.Word(pp.alphanums)

s = 'gfgfdAAA1234ZZZuijjk'
rule = pp.nestedExpr('AAA', 'ZZZ')
for match in rule.searchString(s):
    print(match)

#textpreprocessing #nlp #py

.partition() (picking up piece of string between separators)

# picking up piece of string between separators
# function using partition, like partition, but drops the separators
def between(left,right,s):
    before,_,a = s.partition(left)
    a,_,after = a.partition(right)
    return before,a,after

s = "bla bla blaa <a>data</a> lsdjfasdjöf (important notice) 'Daniweb forum' tcha tcha tchaa"
print between('<a>','</a>',s)
print between('(',')',s)
print between("'","'",s)

""" Output:
('bla bla blaa ', 'data', " lsdjfasdj\xc3\xb6f (important notice) 'Daniweb forum' tcha tcha tchaa")
('bla bla blaa <a>data</a> lsdjfasdj\xc3\xb6f ', 'important notice', " 'Daniweb forum' tcha tcha tchaa")
('bla bla blaa <a>data</a> lsdjfasdj\xc3\xb6f (important notice) ', 'Daniweb forum', ' tcha tcha tchaa')
"""

How to make Sentiment Analysis usingTextBlob in Python

text_prepare

remove_arabic_punctuations

remove_diacritics

Utility: Preprocessing text w/ Spacy

204-named-entity-recognition: Named Entity Recognition with OpenVINO

405-paddle-ocr-webcam: PaddleOCR with OpenVINO

working with time series snippets

Drawer

Retrain spacy tagger pos

n-gram with filter POS tag

colab common useful snippets

Working with null nan nul

convert dataframe to txt, to list

remove punc and stopword chinese

convert tra to sim chinese

How to delete a layer in pretrained model using Huggingface

apply function and def

replace function in dataframe

A detailed example of data loaders with PyTorch

create and save dataframe to csv

Combine columns in dataframe

findall and search text between 2 strings

.partition() (picking up piece of string between separators)

PyParsing - extract the substring between two markers

.partition() (picking up piece of string between separators)

Save snippets that work with our extensions