# Load the dataset and keep one non-null row per description.
df = pd.read_excel("/content/sample_data/report.xlsx")
df = df.dropna(subset=["desc"])           # drop rows with a missing description
df = df.drop_duplicates(subset=["desc"])  # keep the first occurrence of each description
df = df.reset_index(drop=True)            # renumber rows 0..n-1 after filtering
# Clean text data.
# Patterns are compiled once at module level; cleantwt() runs once per row.
_URL_RE = re.compile(r"https?://\S+")
_MENTION_RE = re.compile(r"@\S+")            # original '@[\S]' dropped only one char after '@'
_HASHTAG_RE = re.compile(r"#[A-Za-z0-9_]+")  # original class '[A-Za_z0-9]' missed letters b-y
_RT_RE = re.compile(r"\bRT\b")               # retweet marker
_URDU_DIGITS_RE = re.compile(r"[۰-۹]+")      # Extended Arabic-Indic digits U+06F0-U+06F9
_LATIN_RE = re.compile(r"[A-Za-z]")          # strip remaining Latin letters (Urdu-only corpus)
# Latin and Urdu punctuation/symbols, replaced by a space.
# NOTE(review): the original punctuation class was garbled in transit; this is a
# reconstruction covering the characters visible in the source — confirm against
# the original notebook if available.
_PUNCT_RE = re.compile(r"""[&;><!،:,().٬…»«/\\#_٪%؟?+\-\[\]'‘’“”"|]""")
_WS_RE = re.compile(r"\s+")


def cleantwt(twt):
    """Clean one tweet/description string for Urdu topic modelling.

    Removes URLs, @mentions, hashtags, the 'RT' marker, Urdu digits,
    Latin letters and punctuation, then collapses whitespace.

    Parameters
    ----------
    twt : str or any
        Raw text. Non-string values (e.g. NaN) are returned unchanged.

    Returns
    -------
    str or any
        The cleaned string, or the input untouched when not a string.
    """
    if not isinstance(twt, str):
        return twt  # leave NaN / non-string cells untouched

    # URLs and mentions must be removed BEFORE stripping Latin letters;
    # in the original the '[A-Za-z]' pass ran first, so 'https?://\S+'
    # could never match afterwards.
    twt = _URL_RE.sub("", twt)
    twt = _MENTION_RE.sub("", twt)
    twt = _HASHTAG_RE.sub("", twt)
    twt = _RT_RE.sub(" ", twt)
    twt = _URDU_DIGITS_RE.sub("", twt)  # one pass; original removed the same digits twice
    twt = _LATIN_RE.sub("", twt)
    twt = _PUNCT_RE.sub(" ", twt)
    # Collapse runs of whitespace (including newlines) and trim.
    return _WS_RE.sub(" ", twt).strip()
# Apply the cleaner to every description.
df["clean_text"] = df["desc"].apply(cleantwt)
# Load the Urdu stopword list (one word per line, UTF-8).
with open('/content/sample_data/stopwords.txt', 'r', encoding='utf-8') as file:
    # splitlines() drops each entry's trailing newline.
    stopwords_ur = file.read().splitlines()
# Remove stopwords.
def remove_stopwords(text):
    """Return the tokens of *text* that are not Urdu stopwords.

    Parameters
    ----------
    text : iterable of str
        A tokenized document.

    Returns
    -------
    list of str
        Tokens with entries of the global ``stopwords_ur`` removed.
    """
    # Membership tests against a set are O(1); stopwords_ur is a list,
    # so build the set once per call instead of scanning the list per token.
    stop = set(stopwords_ur)
    return [word for word in text if word not in stop]
# Tokenize and remove stopwords.
def preprocess(text):
    """Tokenize *text* and drop Urdu stopwords.

    Parameters
    ----------
    text : str
        A cleaned document string.

    Returns
    -------
    list of str
        Lower-cased tokens (gensim's simple_preprocess defaults keep tokens
        of length 2-15) with stopwords removed.
    """
    tokens = gensim.utils.simple_preprocess(text)
    return remove_stopwords(tokens)
# Apply tokenization + stopword removal to every cleaned description.
df["clean_text_no_stopwords"] = df["clean_text"].apply(preprocess)
# Create n-grams.
def make_n_grams(texts):
    """Detect frequent bigrams/trigrams in *texts* and apply them.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents.

    Returns
    -------
    list of list of str
        Documents in which detected collocations are merged into single
        tokens joined by '_' (e.g. ['a', 'b'] -> ['a_b']).
    """
    bigram = gensim.models.Phrases(texts, min_count=5, threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    # The trigram model is trained on bigram-transformed text so that
    # ('a_b', 'c') pairs can merge into 'a_b_c'.
    trigram = gensim.models.Phrases(bigram[texts], threshold=100)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    bigrams_text = [bigram_mod[doc] for doc in texts]
    # Each doc in bigrams_text is already bigram-transformed; the original
    # applied bigram_mod a second time here, which is redundant.
    trigrams_text = [trigram_mod[doc] for doc in bigrams_text]
    return trigrams_text
# Apply the n-gram models and build the topic-model inputs.
data_ngrams = make_n_grams(df["clean_text_no_stopwords"])
# Space-joined form, useful for vectorizers that expect raw strings.
data_strings = [" ".join(doc) for doc in data_ngrams]
# gensim dictionary: token -> integer id.
id2word = corpora.Dictionary(data_ngrams)
texts = data_ngrams
# Bag-of-words corpus: one list of (token_id, count) pairs per document.
corpus = [id2word.doc2bow(text) for text in texts]
# (the pipeline continues after this point; remainder not shown in this chunk)