Snippets Collections
def delete_links(input_text):
    """Replace every URL-like substring of ``input_text`` with one space.

    The pattern (case-insensitive) recognises ``http://`` / ``https://``
    URLs, ``www.`` hosts and bare ``domain.tld/`` paths. Parenthesised
    segments inside the URL are allowed, and the last matched character may
    not be whitespace or common trailing punctuation, so a sentence-ending
    ``.`` or a closing quote is left in place.

    Args:
        input_text: Text to scan.

    Returns:
        The text with each match replaced by ``' '``.
    """
    # Triple-quoted raw string: the final character class contains both
    # ' and ", either of which would end a single-quoted literal early.
    pattern = r"""(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))"""
    return re.sub(pattern, ' ', input_text)

def delete_repeated_characters(input_text):
    """Shorten any run of three or more identical characters to exactly two.

    Runs of one or two characters are left untouched. ``.`` does not match
    a newline, so repeated line breaks are never collapsed.

    Args:
        input_text: Text to process.

    Returns:
        The text with long character runs squeezed to length two.
    """
    long_run = re.compile(r'(.)\1{2,}')
    # Rewrite each run as the captured character twice.
    return long_run.sub(r"\1\1", input_text)

def replace_letters(input_text):
    """Normalise Arabic letter variants to a single canonical letter.

    Alef carrying a hamza or madda is mapped to bare alef, and teh marbuta
    is mapped to heh. All other characters are returned unchanged.

    Args:
        input_text: Text to normalise.

    Returns:
        The text with the listed letters substituted.
    """
    # Code points are spelled out so the table survives editors that
    # re-normalise or visually reorder Arabic text.
    table = str.maketrans({
        "\u0623": "\u0627",  # alef with hamza above -> alef
        "\u0629": "\u0647",  # teh marbuta           -> heh
        "\u0625": "\u0627",  # alef with hamza below -> alef
        "\u0622": "\u0627",  # alef with madda above -> alef
    })
    # One pass over the string; no regex alternation (and no empty
    # alternative matching at every position) is needed for 1:1 swaps.
    return input_text.translate(table)

def clean_text(input_text):
    """Blank out punctuation and keep only purely alphabetic tokens.

    Every character matched by the punctuation class is replaced with a
    space, the result is tokenised with ``nltk.word_tokenize``, and tokens
    for which ``str.isalpha()`` is false are discarded.

    Args:
        input_text: Text to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # NOTE: inside the class, `\+-:` is a RANGE from '+' to ':', so besides
    # '+' and ':' it also covers ',', '-', '.', '/' and the digits 0-9.
    punctuation = re.compile(r'[/(){}\[\]|@âÂ,;\?\'\"\*…؟–’،!&\+-:؛-]')
    spaced = punctuation.sub(" ", input_text)
    return ' '.join(
        token for token in nltk.word_tokenize(spaced) if token.isalpha()
    )

def remove_vowelization(input_text):
    """Strip Arabic short-vowel marks and the tatweel from ``input_text``.

    Args:
        input_text: Text that may carry diacritics.

    Returns:
        The text with the nine characters listed below removed.
    """
    # Explicit code points instead of bare combining marks in the source:
    # the marks are invisible on their own and easily lost or reordered by
    # editors, normalisation, or copy-paste.
    marks = (
        "\u0651"  # shadda
        "\u064e"  # fatha
        "\u064b"  # fathatan (tanwin fath)
        "\u064f"  # damma
        "\u064c"  # dammatan (tanwin damm)
        "\u0650"  # kasra
        "\u064d"  # kasratan (tanwin kasr)
        "\u0652"  # sukun
        "\u0640"  # tatweel / kashida
    )
    # Single pass that deletes each listed character.
    return input_text.translate(str.maketrans('', '', marks))

def delete_stopwords(input_text):
    """Lemmatise whitespace-separated tokens and drop stop words.

    Each token is passed through ``nltk.WordNetLemmatizer`` and the
    resulting lemma is kept only if it is absent from the combined NLTK
    Arabic and English stop-word lists.

    Args:
        input_text: Text to filter.

    Returns:
        The kept lemmas joined by single spaces.
    """
    corpus = nltk.corpus.stopwords
    stop_words = set(corpus.words("arabic") + corpus.words("english"))
    tokens = nltk.tokenize.WhitespaceTokenizer().tokenize(input_text)
    lemmatizer = nltk.WordNetLemmatizer()

    kept = []
    for token in tokens:
        lemma = lemmatizer.lemmatize(token)
        # Membership is tested on the lemma, not on the raw token.
        if lemma not in stop_words:
            kept.append(lemma)
    return ' '.join(kept)

def stem_text(input_text):
    """Stem every whitespace-separated token with ``ISRIStemmer``.

    Args:
        input_text: Text to stem.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    stemmer = ISRIStemmer()
    words = nltk.tokenize.WhitespaceTokenizer().tokenize(input_text)
    return ' '.join(map(stemmer.stem, words))


def text_prepare(input_text, ar_text):
    """Run the full cleaning pipeline over ``input_text``.

    Links are removed, repeated characters collapsed, punctuation cleaned
    and stop words dropped, in that order. After that, Arabic text gets
    letter normalisation followed by diacritic removal; any other text is
    lower-cased.

    Args:
        input_text: Raw text.
        ar_text: Truthy to apply the Arabic-specific steps, falsy to
            lower-case instead.

    Returns:
        The prepared text.
    """
    shared_steps = (
        delete_links,
        delete_repeated_characters,
        clean_text,
        delete_stopwords,
    )
    out_text = input_text
    for step in shared_steps:
        out_text = step(out_text)

    if not ar_text:
        # NOTE(review): lower-casing happens after stop-word removal; if the
        # stop-word lists are lower-case, capitalised stop words survive.
        # Confirm this ordering is intended.
        return out_text.lower()
    return remove_vowelization(replace_letters(out_text))
# Characters stripped by remove_arabic_punctuations(), in order:
#   :  "  Arabic question mark (U+061F)  !  Arabic semicolon (U+061B)
#   Arabic comma (U+060C)  ,  .  Arabic date separator (U+060D)
# Whitespace is deliberately NOT part of the set, so words stay separated.
ARABIC_PUNCTUATION = ':"\u061f!\u061b\u060c,.\u060d'

def remove_arabic_punctuations(text: str) -> str:
    '''
        Delete every character listed in ARABIC_PUNCTUATION from text;
        all other characters, including spaces, are kept.

        text : ", أَهْلًا وسَهْلًا Hello 212"
        output :
            ---> " أَهْلًا وسَهْلًا Hello 212"
    '''
    # The table is built per call so it always reflects the current constant.
    return text.translate(str.maketrans('', '', ARABIC_PUNCTUATION))
def remove_diacritics(text : str) -> str:
    '''
        Drop every character of text that is contained in HARAKAT.

        text : "أَهْلًا وسَهْلًا Hello 212"
        output :
            ---> "أهلا وسهلا Hello 212"
    '''
    return ''.join(filter(lambda char: char not in HARAKAT, text))
more: https://gist.github.com/endolith/157796

ಠ_ಠ
( ͡° ͜ʖ ͡°)
¯\_(ツ)_/¯
(╯°□°)╯︵ ┻━┻ 

http://www.fileformat.info/convert/text/upside-down.htm

ಠ_ಠ [disapprove]
Ծ_Ծ [disapprove]
ಠ~ಠ [hrm…]
ఠ_ఠ [o rly?]
ಠ_ರೃ [dignified]
ಠ_ృ [dignified]
ಠ╭╮ಠ [frown]
◔_◔ [rolling eyes]
𝄇⥀.⥀𝄆 [rolling eyes]
⊙_ʘ [crazy/wonky]
◴_◶ [herp derp]
◕ ◡ ◕ [smile]
(๏̯͡๏﴿ [sad]
(͡๏̯͡๏) [sad]
◔̯◔ [sad]
⊙︿⊙ [sad]
◕︵◕ [sad]
●︵• [sad]
◉︵◉ [really sad]
ಡ_ಡ [misty eyes]
ಥ_ಥ [crying]
ಢ_ಢ [crying]
ಢ_ಥ [heavily distraught]
⊙﹏⊙ [embarrassed]
( ゚o゚) [surprised]
⋋_⋌ [frustrated]
〴⋋_⋌〵[angry]
ಠ益ಠ [rage]
ヽ(`Д´)ノ [raging] ‎(ノ≥∇≤)ノ [raging]
(︶ε︶メ) [deep breaths]
˚▱˚ [gasp]
⊙▂⊙ [gasp]
⊙▃⊙ [bigger gasp]
(ΘεΘ;) [nervous]
(゚ヮ゚) [happy]
〓D [happy]
(´ー`) [content]
(´▽`) [haha]
(゚*゚) [pucker]
(。・_・。) [blush]
♥╭╮♥ [lovesick]
≖◡ಎ≖ [devious]
(///_ಥ) [injured]
(≥_<) [black eye]
ʕʘ‿ʘʔ [doofy]
:-þ [silly]
:^Þ
¯\_(ツ)_/¯ [lol i dunno]
ヘ(◕。◕ヘ) [ballin]
๏_๏ [stare]
◉_◉
ਉ_ਉ [tired]
☼_☼ [bulging|bloodshot]
♨_♨ [CAN’T UNSEE] ☯‿☯ [peace]
(゚ー゚) [cool]
(• ε •) [sheepish]
(`・ω・´)
¬_¬ [glare]
ಸ_ಸ
ↁ_ↁ
ಆ_ಆ
ಊ_ಊ
ಹ_ಹ
ㅎ_ㅎ
【•】_【•】[woah]
(ு८ு_ .:)
(づ。◕‿‿◕。)づ [hug]
(/◔ ◡ ◔)/ [hug]
٩(̾●̮̮̃̾•̃̾)۶ [celebrate]
\(• ◡ •)/ [celebrate]
\( ゚◡゚)/ [celebrate]
⊂(◉‿◉)つ [wave]
ح˚௰˚づ [wave]
╚(•⌂•)╝ [stop]
☜-(ΘLΘ)-☞ [which way?]
☜。◕‿◕。☞
(✌゚∀゚)☞ [point and laugh]
щ(゚Д゚щ) [Dear god why‽]
ლ(ಠ_ಠ ლ) [settle down]
◖|◔◡◉|◗ [HURR DURR]
( ‘-’)人(゚_゚ ) [high-five]
( _)=mm=(^_^ ) [brofist]
(>'o’)> ♥ <('o’<)
⎝⏠⏝⏠⎠ [content]
( ´_⊃`)[content]
(Ō_ƆŎ) [eyebrow raised]
≖_≖ [I see what you did there…]
\|  ̄ヘ ̄|/ [praise me]
̿’ ̿’\̵͇̿̿\з=(•̪●)=ε/̵͇̿̿/’̿’̿ [big guns]
< ('o'<) ( '-’ ) (>‘o’)> v( ‘.’ )v < (' .' )> < ('.'<) ( '.’ ) (>‘.’)> v( ‘.’ )v < (' .' )> [dancing]
♪┏(・o・)┛♪┗ ( ・o・) ┓♪┏ ( ) ┛♪┗ (・o・ ) ┓♪┏(・o・)┛♪ [singing/dancing]
Ƹ̵̡Ӝ̵̨̄Ʒ [butterfly]
/╲/\╭ºoꍘoº╮/\╱\ [spider]
(* ・(エ)・ *) [pedobear?]
(\/) (°,,°) (\/) [WOOPwoopwoopwoopwoop]
⊛ठ̯⊛ [bicycle… face?]
ತಟತ [buttface]
’;‘ [scream]
/:€ [Myth busted!]
˚⌇˚ [squirm]
ಈ_ಈ [eyes tied shut] (wat?)
ⓧ_ⓧ [x]
⨂_⨂ [x]
✖_✖ [x]
×̯× [x]
‹•.•›
•ﺑ• [oooh]
(० ्०)
ôヮô
¢‿¢
!⑈ˆ~ˆ!⑈
•(⌚_⌚)• [late]
(▰˘◡˘▰)
۹ↁﮌↁ
乂◜◬◝乂
ತ_ಎತ
ಠﭛಠ [goatee]
⏠⏝⏠
⇎_⇎
흫_흫
句_句
໖_໖
༺‿༻
ಠ , ಥ
१✌◡✌५
१|˚–˚|५
โ๏௰๏ใ ื
◜㍕◝
◷_௰◴
◎ܫ◎
(˚ㄥ_˚)
(˚இ˚)
ộ_ộ
◘_◘ 
◙‿◙
δﺡό
⊂•⊃_⊂•⊃
ح˚ᆺ˚ว
❂‿❂ 
❐‿❑
☾˙❀‿❀˙☽ 
(ΘL_Θ)
●¿_●
《〠_〠》 [f*ck yeah]
حᇂﮌᇂ) [f*ck yeah]
ಠ︵ಠ凸 [f* u]
┌∩┐(>_<)┌∩┐ [f* u]
‹^› ‹(•¿•)› ‹^› [f* u]
✄————- [scissors]
╰▄︻▄╯ [YEAAAAAAHHH]
▄︻┻┳═一 [rifle]
(̅_̅_̅_̅(̅_̅_̅_̅_̅_̅_̅̅_̅()ڪے [cigarette]
( ̲̅:̲̅:̲̅:̲̅[̲̅ ̲̅]̲̅:̲̅:̲̅:̲̅) [band-aid]
ı̴̴̡̡̡ ̡͌l̡̡̡ ̡͌l̡*̡̡ ̴̡ı̴̴̡ ̡̡͡|̲̲̲͡͡͡ ̲▫̲͡ ̲̲̲͡͡π̲̲͡͡ ̲̲͡▫̲̲͡͡ ̲|̡̡̡ ̡ ̴̡ı̴̡̡ ̡͌l̡̡̡̡ [house]
lıllı ((((|̲̅̅●̲̅̅|̲̅̅=̲̅̅|̲̅̅●̲̅̅|)))) ıllı [boombox]
┣▇▇▇═── [needle]
┣▇▇▇═─────────── [*whimper]
╰☆╮ [spinning killblade]
ϟ [Potter]
ಠ_ಠ
star

Mon Jan 02 2023 14:51:40 GMT+0000 (Coordinated Universal Time)

#python #kaggle #nlp #arabicnlp #unicode
star

Sun Dec 25 2022 15:54:09 GMT+0000 (Coordinated Universal Time)

#python #kaggle #nlp #arabicnlp #unicode
star

Sun Dec 25 2022 15:52:08 GMT+0000 (Coordinated Universal Time)

#python #kaggle #nlp #arabicnlp #unicode
star

Wed Jun 16 2021 13:49:40 GMT+0000 (Coordinated Universal Time) http://wrttn.me/30dbfd/

#unicode #emoticons #dongers

Save snippets that work with our extensions

Available in the Chrome Web Store Get Firefox Add-on Get VS Code extension