Lenta.Ru parsing¶
In [1]:
import pandas as pd
df = pd.read_csv('/kaggle/input/news-dataset-from-lenta-ru-2019-2023/lenta_ru_news_2019_2023.csv')
In [2]:
lenta_topics = {
"Россия": 0,
"Экономика": 1,
"Силовые структуры": 2,
"Бывший СССР": 3,
"Спорт": 4,
"Забота о себе": 5,
"Здоровье": 5,
"Строительство": 6,
"Путешествия": 7,
"Наука и техника": 8,
"Интернет и СМИ": 0,
"Бизнес": 1,
}
In [3]:
df["number"] = df["topic"].apply(lambda x: lenta_topics[x] if x in lenta_topics else None)
df = df.dropna(subset=['number'])
df = df.dropna(subset=['text'])
df["number"] = df["number"].apply(lambda x: int(x))
In [4]:
df['number'].value_counts()
Out[4]:
number 0 118456 3 51071 1 49015 2 30249 4 28442 8 25209 7 19308 5 7589 Name: count, dtype: int64
CountVectorizer Train¶
In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
In [6]:
x_train, x_test, y_train, y_test = train_test_split(df.text, df.number, random_state=13)
In [7]:
vec = CountVectorizer(ngram_range=(1, 1))
bow = vec.fit_transform(x_train)
bow_test = vec.transform(x_test)
scaler = MaxAbsScaler()
bow = scaler.fit_transform(bow)
bow_test = scaler.transform(bow_test)
clf = LogisticRegression(max_iter=500, random_state=42)
clf.fit(bow, y_train)
pred = clf.predict(bow_test)
print(classification_report(y_test, pred))
precision recall f1-score support 0 0.89 0.92 0.90 29468 1 0.93 0.92 0.92 12283 2 0.93 0.91 0.92 7726 3 0.90 0.89 0.90 12852 4 0.99 0.99 0.99 7030 5 0.93 0.87 0.90 1861 7 0.96 0.92 0.94 4788 8 0.95 0.91 0.93 6327 accuracy 0.92 82335 macro avg 0.93 0.92 0.93 82335 weighted avg 0.92 0.92 0.92 82335
TFIDF Train¶
In [8]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
russian_stopwords = stopwords.words('russian')
[nltk_data] Downloading package stopwords to /usr/share/nltk_data... [nltk_data] Package stopwords is already up-to-date!
In [9]:
tfidf = TfidfVectorizer(min_df=5, max_df=0.7, ngram_range=(1, 3), stop_words=russian_stopwords)
bow = tfidf.fit_transform(x_train)
bow_test = tfidf.transform(x_test)
scaler = MaxAbsScaler()
bow = scaler.fit_transform(bow)
bow_test = scaler.transform(bow_test)
model = LogisticRegression(max_iter=200, random_state=42)
model.fit(bow, y_train)
tf_pred = model.predict(bow_test)
print(classification_report(y_test, tf_pred))
precision recall f1-score support 0 0.91 0.93 0.92 29468 1 0.93 0.93 0.93 12283 2 0.94 0.91 0.93 7726 3 0.92 0.92 0.92 12852 4 0.99 0.99 0.99 7030 5 0.94 0.91 0.92 1861 7 0.97 0.93 0.95 4788 8 0.96 0.92 0.94 6327 accuracy 0.93 82335 macro avg 0.94 0.93 0.94 82335 weighted avg 0.93 0.93 0.93 82335