import numpy as np
import pandas as pd
import re
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# Load the article as a single-column frame (column 0), one row per non-blank
# line.  The file is read directly rather than through read_csv: "\n" is the
# line terminator, which newer pandas releases refuse as a separator, and CSV
# parsing would also give special meaning to quote characters in the prose.
with open("article.txt", encoding="utf-8") as article_file:
    raw_df = pd.DataFrame(
        {0: [line.rstrip("\n") for line in article_file if line.strip()]}
    )
def line_process(s):
    """Normalise one line of text so it can be split into words.

    The line is lower-cased, the punctuation characters
    ``) ( . , — - : « » !`` and tabs are replaced by spaces, runs of spaces
    are collapsed to a single space, and leading/trailing whitespace is
    stripped.

    Args:
        s: The raw line of text.

    Returns:
        The cleaned line, with words separated by single spaces.
    """
    s = s.lower()
    # Raw string: the regex escapes must reach `re` untouched instead of being
    # parsed (and warned about) as string-literal escapes.
    s = re.sub(r"[)(.,—\-:«»\t!]", " ", s)
    # Collapse every run of spaces in one pass; a find/replace loop that swaps
    # a space for a space never makes progress and would spin forever.
    s = re.sub(r" {2,}", " ", s)
    return s.strip()
# Clean every line of the article, then split each cleaned line into words.
df_1 = raw_df[0].apply(line_process)
words_per_line = df_1.apply(lambda line: line.split())

# Flatten the per-line word lists into one array with every word occurrence.
df_arr = np.concatenate(words_per_line)

# One row per occurrence: the word plus a unit 'count' value.  The frame is
# built from two rows and transposed so that they become the two columns.
unit_counts = np.ones(df_arr.size)
df = pd.DataFrame(data=[df_arr[:], unit_counts[:]]).T
df = df.rename(columns={0: 'word', 1: 'count'})
df.describe()
word | count | |
---|---|---|
count | 2277 | 2277.0 |
unique | 1062 | 1.0 |
top | и | 1.0 |
freq | 91 | 2277.0 |
# Count how many rows (occurrences) each distinct word has.
by_word = df.groupby('word')
df_count = by_word.count()
df_count.head(n=5)
count | |
---|---|
word | |
0 | 2 |
10 | 2 |
100 | 1 |
13 | 2 |
14 | 1 |
# Rows whose word is one of the listed forms of "я", counted per form.
# isin() replaces the chain of ==/| comparisons with a single membership test.
ya = df[df['word'].isin(["я", "меня", 'мне', 'мной'])]
ya.groupby('word').count()
count | |
---|---|
word | |
меня | 24 |
мне | 6 |
мной | 1 |
я | 69 |
# Rows whose word is one of the listed forms of "мы", counted per form.
# isin() replaces the chain of ==/| comparisons with a single membership test.
nas = df[df['word'].isin(["мы", "нам", 'нас', 'нами'])]
nas.groupby('word').count()
count | |
---|---|
word | |
мы | 7 |
нам | 1 |
нас | 3 |
# Rows whose word is any of the pronoun forms below, counted per form and
# shown as a single wide row.  A membership test replaces the long ==/| chain;
# 'те' is listed once (it was compared twice before, which had no effect).
pronoun_forms = [
    "я", "меня", 'мне', 'мной',
    "мы", "нам", 'нас', 'нами',
    "ты", "они", 'тех', 'те',
    'она', 'он', 'оно',
    'вы', 'себя', 'свои',
]
pronouns = df[df['word'].isin(pronoun_forms)]
pronouns.groupby('word').count().transpose()
word | вы | меня | мне | мной | мы | нам | нас | он | она | они | свои | себя | те | тех | ты | я |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1 | 24 | 6 | 1 | 7 | 1 | 3 | 4 | 5 | 5 | 3 | 5 | 2 | 1 | 11 | 69 |
# Rows whose word is one of the listed second/third-person and reflexive
# forms, with the total number of occurrences summed over all of them.
# A membership test replaces the ==/| chain; 'те' is listed once (it was
# compared twice before, which had no effect).
te_forms = ["ты", "они", 'тех', 'те', 'она', 'он', 'оно', 'вы', 'себя']
te = df[df['word'].isin(te_forms)]
te.groupby('word').count().sum()
count 34 dtype: int64
# Rows whose word is one of the listed company names, counted per company and
# shown as a single wide row.  isin() replaces the chain of ==/| comparisons.
company_names = [
    "viber", "juno", 'epam', 'ciklum',
    'tietoenator', 'tieto', 'itransition',
]
companies = df[df['word'].isin(company_names)]
companies_count = companies.groupby('word').count()
companies_count.transpose()
word | ciklum | epam | itransition | juno | tieto | tietoenator | viber |
---|---|---|---|---|---|---|---|
count | 5 | 4 | 1 | 14 | 2 | 1 | 3 |
# Number of company mentions that are 'juno'.
companies.loc[companies['word'].eq('juno')].count()
word 14 count 14 dtype: int64
# Number of company mentions that are anything other than 'juno'.
companies.loc[companies['word'] != 'juno'].count()
word 16 count 16 dtype: int64