text_prepare
Mon Jan 02 2023 14:51:40 GMT+0000 (UTC)
Saved by @abdalrahmansh
#python
#kaggle
#nlp
#arabicnlp
#unicode
import re

import nltk
from nltk.stem.isri import ISRIStemmer


def delete_links(input_text):
    # Strip URLs (http/https, www., and bare domain/path forms), replacing them with a space.
    pattern = r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'
    out_text = re.sub(pattern, ' ', input_text)
    return out_text
def delete_repeated_characters(input_text):
    # Collapse any character repeated three or more times down to two occurrences.
    pattern = r'(.)\1{2,}'
    out_text = re.sub(pattern, r"\1\1", input_text)
    return out_text
def replace_letters(input_text):
    # Normalize common Arabic letter variants (hamza forms of alef, taa marbuta).
    replace = {"أ": "ا", "ة": "ه", "إ": "ا", "آ": "ا"}
    replace = dict((re.escape(k), v) for k, v in replace.items())
    pattern = re.compile("|".join(replace.keys()))
    out_text = pattern.sub(lambda m: replace[re.escape(m.group(0))], input_text)
    return out_text
def clean_text(input_text):
    # Replace punctuation and special symbols with spaces, then keep only alphabetic tokens.
    replace = r'[/(){}\[\]|@âÂ,;\?\'\"\*…؟–’،!&\+\-:؛]'
    out_text = re.sub(replace, " ", input_text)
    words = nltk.word_tokenize(out_text)
    words = [word for word in words if word.isalpha()]
    out_text = ' '.join(words)
    return out_text
def remove_vowelization(input_text):
    # Remove Arabic diacritics (tashkeel) and the tatweel/kashida character.
    vowelization = re.compile(""" ّ|َ|ً|ُ|ٌ|ِ|ٍ|ْ|ـ""", re.VERBOSE)
    out_text = re.sub(vowelization, '', input_text)
    return out_text
def delete_stopwords(input_text):
    # Lemmatize tokens (WordNet, so effectively English only) and drop Arabic and English stopwords.
    stop_words = set(nltk.corpus.stopwords.words("arabic") + nltk.corpus.stopwords.words("english"))
    tokenizer = nltk.tokenize.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(input_text)
    wnl = nltk.WordNetLemmatizer()
    lemmatized_tokens = [wnl.lemmatize(t) for t in tokens]
    out_text = [w for w in lemmatized_tokens if w not in stop_words]
    out_text = ' '.join(out_text)
    return out_text
def stem_text(input_text):
    # Apply the ISRI Arabic stemmer to each whitespace-separated token.
    st = ISRIStemmer()
    tokenizer = nltk.tokenize.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(input_text)
    out_text = [st.stem(w) for w in tokens]
    out_text = ' '.join(out_text)
    return out_text
def text_prepare(input_text, ar_text):
    # Full pipeline: strip links, collapse repeated characters, clean punctuation,
    # drop stopwords, then apply Arabic normalization or English lowercasing.
    out_text = delete_links(input_text)
    out_text = delete_repeated_characters(out_text)
    out_text = clean_text(out_text)
    out_text = delete_stopwords(out_text)
    if ar_text:
        out_text = replace_letters(out_text)
        out_text = remove_vowelization(out_text)
    else:
        out_text = out_text.lower()
    return out_text
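
A minimal usage sketch, assuming the required NLTK data (stopwords, punkt, wordnet) has already been downloaded; the sample strings are illustrative only:

# nltk.download('stopwords'); nltk.download('punkt'); nltk.download('wordnet')
ar_sample = "شاهد المباراة كاملة على https://example.com الآن!!!"
en_sample = "Check out the FULL match highlights at https://example.com now!!!"
print(text_prepare(ar_sample, ar_text=True))   # cleaned, normalized Arabic text
print(text_prepare(en_sample, ar_text=False))  # cleaned, lowercased English text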