Lenta.Ru news parsing and topic classification¶

In [1]:
# Load the Lenta.Ru 2019-2023 news dump (Kaggle dataset) into a DataFrame.
# NOTE(review): absolute /kaggle/input path — only runs in a Kaggle kernel.
import pandas as pd
df = pd.read_csv('/kaggle/input/news-dataset-from-lenta-ru-2019-2023/lenta_ru_news_2019_2023.csv')
In [2]:
# Map Lenta.Ru topic names (Russian) to integer class labels.
# Several related topics are deliberately merged into one class:
#   "Россия" + "Интернет и СМИ" -> 0, "Экономика" + "Бизнес" -> 1,
#   "Забота о себе" + "Здоровье" -> 5.
# NOTE(review): label 6 ("Строительство") never appears in the class
# counts below — presumably no rows with that topic survive filtering.
lenta_topics = {
    "Россия": 0,
    "Экономика": 1,
    "Силовые структуры": 2,
    "Бывший СССР": 3,
    "Спорт": 4,
    "Забота о себе": 5,
    "Здоровье": 5,
    "Строительство": 6,
    "Путешествия": 7,
    "Наука и техника": 8,
    "Интернет и СМИ": 0,
    "Бизнес": 1,
    }
In [3]:
# Convert topic names to integer class labels and drop rows whose topic
# is unmapped or whose article text is missing.
# Series.map yields NaN for topics absent from lenta_topics, which the
# dropna below removes — same effect as the previous element-wise lambda
# that returned None for unknown topics, but vectorized.
df["number"] = df["topic"].map(lenta_topics)

# One dropna call covering both columns replaces two sequential calls.
df = df.dropna(subset=['number', 'text'])

# After dropna only valid labels remain, so a vectorized cast replaces
# the per-element apply(int).
df["number"] = df["number"].astype(int)
In [4]:
# Class distribution after filtering (rich display as the cell output).
# Classes are imbalanced: label 0 dominates; label 6 is absent entirely.
df['number'].value_counts()
Out[4]:
number
0    118456
3     51071
1     49015
2     30249
4     28442
8     25209
7     19308
5      7589
Name: count, dtype: int64

Training with CountVectorizer features¶

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
In [6]:
# Default 75/25 train/test split with a fixed seed for reproducibility.
# NOTE(review): classes are imbalanced — consider stratify=df.number so
# the test set mirrors the class distribution; verify before changing,
# as it would alter every downstream metric.
x_train, x_test, y_train, y_test = train_test_split(df.text, df.number, random_state=13)
In [7]:
# Baseline model: unigram bag-of-words features, max-abs scaling (keeps
# the matrix sparse while bringing counts into [0, 1]), and a
# logistic-regression classifier. Prints the per-class test report.
count_vectorizer = CountVectorizer(ngram_range=(1, 1))
train_counts = count_vectorizer.fit_transform(x_train)
test_counts = count_vectorizer.transform(x_test)

# Fit the scaler on the training matrix only, then apply to both splits.
max_abs = MaxAbsScaler().fit(train_counts)
train_counts = max_abs.transform(train_counts)
test_counts = max_abs.transform(test_counts)

classifier = LogisticRegression(max_iter=500, random_state=42)
classifier.fit(train_counts, y_train)
test_predictions = classifier.predict(test_counts)

print(classification_report(y_test, test_predictions))
              precision    recall  f1-score   support

           0       0.89      0.92      0.90     29468
           1       0.93      0.92      0.92     12283
           2       0.93      0.91      0.92      7726
           3       0.90      0.89      0.90     12852
           4       0.99      0.99      0.99      7030
           5       0.93      0.87      0.90      1861
           7       0.96      0.92      0.94      4788
           8       0.95      0.91      0.93      6327

    accuracy                           0.92     82335
   macro avg       0.93      0.92      0.93     82335
weighted avg       0.92      0.92      0.92     82335

Training with TF-IDF features¶

In [8]:
# Fetch the NLTK Russian stop-word list for the TF-IDF vectorizer.
# nltk.download is a no-op when the corpus is already cached on disk
# (see the "already up-to-date" log line below).
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

russian_stopwords = stopwords.words('russian')
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [9]:
# Stronger model: TF-IDF over word 1-3 grams with Russian stop words
# removed; terms in fewer than 5 documents or in more than 70% of
# documents are dropped. Same scaler + classifier setup as the
# bag-of-words baseline, for a like-for-like comparison.
tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.7, ngram_range=(1, 3), stop_words=russian_stopwords)
train_tfidf = tfidf_vectorizer.fit_transform(x_train)
test_tfidf = tfidf_vectorizer.transform(x_test)

# Scale using training-set statistics only.
abs_scaler = MaxAbsScaler().fit(train_tfidf)
train_tfidf = abs_scaler.transform(train_tfidf)
test_tfidf = abs_scaler.transform(test_tfidf)

tfidf_classifier = LogisticRegression(max_iter=200, random_state=42)
tfidf_classifier.fit(train_tfidf, y_train)
tfidf_predictions = tfidf_classifier.predict(test_tfidf)

print(classification_report(y_test, tfidf_predictions))
              precision    recall  f1-score   support

           0       0.91      0.93      0.92     29468
           1       0.93      0.93      0.93     12283
           2       0.94      0.91      0.93      7726
           3       0.92      0.92      0.92     12852
           4       0.99      0.99      0.99      7030
           5       0.94      0.91      0.92      1861
           7       0.97      0.93      0.95      4788
           8       0.96      0.92      0.94      6327

    accuracy                           0.93     82335
   macro avg       0.94      0.93      0.94     82335
weighted avg       0.93      0.93      0.93     82335