def delete_links(input_text):
    """Replace every URL-like substring of *input_text* with a single space.

    A substring counts as a link when it starts with ``http://`` /
    ``https://``, with ``www.`` (optionally ``www`` plus up to three digits),
    or with a ``domain.tld/`` prefix. Trailing punctuation is left in place.

    Args:
        input_text: Text to scan.

    Returns:
        The text with each matched link replaced by ``' '``.
    """
    # The pattern contains both ' and ", so it has to live in a triple-quoted
    # raw string: a single-quoted literal would end at the apostrophe inside
    # the final character class.
    pattern = r"""(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))"""
    return re.sub(pattern, ' ', input_text)


def delete_repeated_characters(input_text):
    """Collapse any run of three or more identical characters down to two.

    Args:
        input_text: Text to process.

    Returns:
        The text with elongated runs shortened, e.g. ``"cooool"`` becomes
        ``"cool"``. Runs of one or two characters are untouched. ``.`` does
        not match a newline, so runs of newlines are kept as they are.
    """
    return re.sub(r'(.)\1{2,}', r"\1\1", input_text)


def replace_letters(input_text):
    """Normalise a few Arabic letter variants to a single base form.

    The hamza/madda forms of alef are mapped to bare alef and teh marbuta is
    mapped to heh.

    Args:
        input_text: Text to normalise.

    Returns:
        The text with every mapped letter substituted.
    """
    # Only non-empty keys are listed: an empty key would add a zero-width
    # alternative that matches at every position without changing the output.
    mapping = {"أ": "ا", "ة": "ه", "إ": "ا", "آ": "ا"}
    pattern = re.compile("|".join(re.escape(letter) for letter in mapping))
    return pattern.sub(lambda match: mapping[match.group(0)], input_text)


def clean_text(input_text):
    """Blank out punctuation and keep only purely alphabetic tokens.

    Args:
        input_text: Text to clean.

    Returns:
        The tokens for which ``str.isalpha()`` is true, joined by single
        spaces.
    """
    # NOTE: inside the character class, ``\+-:`` is a range from '+' to ':',
    # so besides '+', '-' and ':' it also covers ',', '.', '/' and the ASCII
    # digits. Those characters are therefore replaced by spaces as well.
    punctuation = r'[/(){}\[\]|@âÂ,;\?\'\"\*…؟–’،!&\+-:؛-]'
    spaced = re.sub(punctuation, " ", input_text)
    return ' '.join(
        token for token in nltk.word_tokenize(spaced) if token.isalpha()
    )


def remove_vowelization(input_text):
    """Strip Arabic diacritics (tashkeel) and the tatweel elongation mark.

    Args:
        input_text: Text to process.

    Returns:
        The text with every matched mark removed.
    """
    # re.VERBOSE makes the regex ignore the whitespace inside the pattern, so
    # only the marks separated by '|' are matched.
    vowelization = re.compile(""" ّ|َ|ً|ُ|ٌ|ِ|ٍ|ْ|ـ""", re.VERBOSE)
    return vowelization.sub('', input_text)


# Lazily-built union of the NLTK Arabic and English stop-word lists.
_STOP_WORDS = None


def _get_stop_words():
    """Return the combined stop-word set, reading the corpora only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(
            nltk.corpus.stopwords.words("arabic")
            + nltk.corpus.stopwords.words("english")
        )
    return _STOP_WORDS


def delete_stopwords(input_text):
    """Lemmatise whitespace-separated tokens and drop stop words.

    Tokens are lemmatised with NLTK's ``WordNetLemmatizer`` and then checked
    against the Arabic and English stop-word lists.

    Args:
        input_text: Text to filter.

    Returns:
        The surviving lemmatised tokens joined by single spaces.
    """
    stop_words = _get_stop_words()
    tokens = nltk.tokenize.WhitespaceTokenizer().tokenize(input_text)
    lemmatizer = nltk.WordNetLemmatizer()
    # text_prepare lower-cases only after this step, so compare the
    # lower-cased form; otherwise capitalised stop words such as "The" would
    # slip through (the NLTK English list is expected to be lower-case).
    kept = [
        lemma
        for lemma in (lemmatizer.lemmatize(token) for token in tokens)
        if lemma.lower() not in stop_words
    ]
    return ' '.join(kept)


def stem_text(input_text):
    """Stem every whitespace-separated token with ``ISRIStemmer``.

    Args:
        input_text: Text to stem.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    stemmer = ISRIStemmer()
    tokens = nltk.tokenize.WhitespaceTokenizer().tokenize(input_text)
    return ' '.join(stemmer.stem(token) for token in tokens)


def text_prepare(input_text, ar_text):
    """Run the cleaning pipeline over *input_text*.

    Steps, in order: remove links, shorten repeated characters, strip
    punctuation and non-alphabetic tokens, remove stop words. After that,
    Arabic text gets letter normalisation and diacritic removal, while any
    other text is lower-cased. Stemming (``stem_text``) is not applied here.

    Args:
        input_text: Raw text to prepare.
        ar_text: Truthy when the text should get the Arabic-specific steps.

    Returns:
        The prepared text.
    """
    out_text = delete_links(input_text)
    out_text = delete_repeated_characters(out_text)
    out_text = clean_text(out_text)
    out_text = delete_stopwords(out_text)
    if ar_text:
        out_text = replace_letters(out_text)
        out_text = remove_vowelization(out_text)
    else:
        out_text = out_text.lower()
    return out_text