import pandas as pd
import text_summarizer
import rouge
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
!ls .data
test.txt.src test.txt.tgt
!mkdir -p .data
!wget -nc -O .data/test.txt.tgt https://raw.githubusercontent.com/lambdaofgod/project_data/master/summarization/cnn-dm/test.txt.tgt.tagged
!wget -nc -O .data/test.txt.src https://raw.githubusercontent.com/lambdaofgod/project_data/master/summarization/cnn-dm/test.txt.src
File ‘.data/test.txt.tgt’ already there; not retrieving. File ‘.data/test.txt.src’ already there; not retrieving.
def map_parallel(f, iter):
    """Apply ``f`` to every element of ``iter`` using a pool of worker processes.

    This is a generator: results are yielded in input order as they become
    available, so a caller can wrap it in a progress bar. The process pool
    is owned by the generator and is shut down when iteration finishes or
    the generator is closed.

    Args:
        f: Callable applied to each element. It is sent to worker
            processes, so it must be picklable (e.g. a module-level
            function).
        iter: Iterable of arguments for ``f``. The name shadows the
            builtin but is kept so existing keyword callers still work.

    Yields:
        ``f(item)`` for each ``item`` of ``iter``, in order.
    """
    # The context manager guarantees the pool is shut down; creating an
    # executor without ever closing it leaks worker processes per call.
    with ProcessPoolExecutor() as executor:
        yield from executor.map(f, iter)
def load_texts(path):
    """Read a text file into a pandas Series with one element per line.

    Lines are returned exactly as ``readlines()`` produces them, i.e. each
    element keeps its trailing newline.

    Args:
        path: Path of the text file to read.

    Returns:
        ``pd.Series`` of the file's lines, in file order.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(path, 'r') as text_file:
        return pd.Series(text_file.readlines())
def maybe_summarize_texts(summarization_method, texts):
    """Summarize every text in parallel and collect the results in a Series.

    The work is dispatched through ``map_parallel`` and wrapped in a tqdm
    progress bar whose total is ``len(texts)``.

    Args:
        summarization_method: Callable taking one text and returning its
            summary; it is handed to ``map_parallel``.
        texts: Sized collection of input texts.

    Returns:
        ``pd.Series`` holding one result per input text, in input order.
    """
    pending_results = map_parallel(summarization_method, texts)
    progress = tqdm(pending_results, total=len(texts))
    collected = list(progress)
    return pd.Series(collected)
def flatten_rouge_dict(rouge_dict):
    """Collapse a two-level dict into a single level.

    Each outer key is joined to each of its inner keys with ``'-'``, e.g.
    ``{'rouge-1': {'r': 0.5}}`` becomes ``{'rouge-1-r': 0.5}``.

    Args:
        rouge_dict: Mapping of string keys to mappings of string keys.

    Returns:
        A flat dict keyed by ``'<outer>-<inner>'``.
    """
    flattened = {}
    for metric_name, metric_stats in rouge_dict.items():
        for stat_name, stat_value in metric_stats.items():
            flattened[metric_name + '-' + stat_name] = stat_value
    return flattened
def get_rouge_df(summaries, references):
    """Score summaries against references with ROUGE, one row per pair.

    Pairs whose summary is the empty string are dropped before scoring;
    the same mask is applied to ``references`` so the pairs stay aligned.

    Args:
        summaries: Series of generated summaries ('' marks a missing one).
        references: Series of reference summaries, aligned with
            ``summaries``.

    Returns:
        ``pd.DataFrame`` with one row per scored pair and one column per
        flattened score key produced by ``flatten_rouge_dict``.
    """
    has_summary = summaries != ''
    scorer = rouge.Rouge()
    kept_summaries = summaries[has_summary]
    kept_references = references[has_summary]
    pair_scores = scorer.get_scores(kept_summaries, kept_references)
    return pd.DataFrame(list(map(flatten_rouge_dict, pair_scores)))
# Load the input texts and their reference summaries; load_texts returns
# one Series element per line of the file.
input_texts = load_texts('.data/test.txt.src')
reference_summaries = load_texts('.data/test.txt.tgt')
# Whitespace-delimited token count of every text.
input_lengths = input_texts.str.split().apply(len)
reference_summaries_lengths = reference_summaries.str.split().apply(len)
# Put both length distributions side by side; describe() is the value
# displayed by the notebook cell.
lengths_df = pd.DataFrame({'input': input_lengths, 'summary': reference_summaries_lengths})
lengths_df.describe()
| | input | summary |
---|---|---|
count | 11490.000000 | 11490.000000 |
mean | 778.267885 | 66.074151 |
std | 399.743713 | 26.906939 |
min | 59.000000 | 11.000000 |
25% | 475.000000 | 49.000000 |
50% | 701.000000 | 62.000000 |
75% | 998.000000 | 76.000000 |
max | 2380.000000 | 738.000000 |
def target_summary_length(text, summary_length_ratio=0.2):
    """Return the target summary size for ``text``, measured in words.

    The number of whitespace-separated words in ``text`` is multiplied by
    ``summary_length_ratio`` and truncated with ``int()``.

    Args:
        text: Input text.
        summary_length_ratio: Multiplier applied to the word count.

    Returns:
        The scaled word count as an int.
    """
    word_count = len(text.split())
    scaled_length = word_count * summary_length_ratio
    return int(scaled_length)
# Shared summarizer instance used by summarize_with_cbow.
cbow_summarizer = text_summarizer.CentroidBOWSummarizer(length_limit=3)


def summarize_with_cbow(text):
    """Summarize ``text`` with the module-level ``cbow_summarizer``.

    The summarizer's ``limit`` is set to ``target_summary_length(text)``.
    Summarization is best-effort: any ordinary error yields an empty
    summary, which ``get_rouge_df`` later filters out.

    Args:
        text: Text to summarize.

    Returns:
        The summary, or ``''`` if the summarizer raised an exception.
    """
    try:
        return cbow_summarizer.summarize(text, limit=target_summary_length(text))
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate and can stop
        # the worker process.
        return ''
%%time
# Load the 'glove-wiki-gigaword-50' embedding model through
# text_summarizer's gensim loader; it is passed to the embedding
# summarizer below.
embeddings = text_summarizer.centroid_word_embeddings.load_gensim_embedding_model('glove-wiki-gigaword-50');
CPU times: user 18.8 s, sys: 318 ms, total: 19.1 s Wall time: 20 s
# Shared summarizer instance used by summarize_with_embeddings.
embedding_summarizer = text_summarizer.CentroidWordEmbeddingsSummarizer(embeddings, length_limit=3)


def summarize_with_embeddings(text):
    """Summarize ``text`` with the module-level ``embedding_summarizer``.

    The summarizer's ``limit`` is set to ``target_summary_length(text)``.
    Summarization is best-effort: any ordinary error yields an empty
    summary, which ``get_rouge_df`` later filters out.

    Args:
        text: Text to summarize.

    Returns:
        The summary, or ``''`` if the summarizer raised an exception.
    """
    try:
        return embedding_summarizer.summarize(text, limit=target_summary_length(text))
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate and can stop
        # the worker process.
        return ''
# Summarize every input text with the centroid bag-of-words summarizer;
# maybe_summarize_texts runs the calls through map_parallel and shows a
# tqdm progress bar.
cbow_summaries = maybe_summarize_texts(summarize_with_cbow, input_texts)
100%|██████████| 11490/11490 [02:46<00:00, 68.94it/s]
%%time
# ROUGE scores of the CBOW summaries against the references; get_rouge_df
# skips pairs whose summary is the empty string.
cbow_rouge_df = get_rouge_df(cbow_summaries, reference_summaries)
CPU times: user 3min 2s, sys: 504 ms, total: 3min 2s Wall time: 3min 3s
import summa
def summarize_with_textrank(text):
    """Summarize ``text`` with ``summa.summarizer.summarize``.

    The ``words`` argument is set to ``target_summary_length(text)``.
    Summarization is best-effort: any ordinary error yields an empty
    summary, which ``get_rouge_df`` later filters out.

    Args:
        text: Text to summarize.

    Returns:
        The summary, or ``''`` if the summarizer raised an exception.
    """
    try:
        return summa.summarizer.summarize(text, words=target_summary_length(text))
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate and can stop
        # the worker process.
        return ''
%%time
# Summarize every input text with the summa-based wrapper; the calls run
# through map_parallel with a tqdm progress bar.
textrank_summaries = maybe_summarize_texts(summarize_with_textrank, input_texts);
100%|██████████| 11490/11490 [03:11<00:00, 60.12it/s]
CPU times: user 8.69 s, sys: 1.34 s, total: 10 s Wall time: 3min 13s
%%time
# ROUGE scores of the TextRank summaries against the references;
# get_rouge_df skips pairs whose summary is the empty string.
textrank_scores_df = get_rouge_df(textrank_summaries, reference_summaries)
# Display the per-document score table as the cell output.
textrank_scores_df
CPU times: user 2min 39s, sys: 295 ms, total: 2min 40s Wall time: 2min 40s
# Mean of every score column whose name contains '-r' (the recall columns).
textrank_scores_df.loc[:, [name for name in textrank_scores_df.columns if '-r' in name]].mean()
rouge-1-r 0.505383 rouge-2-r 0.201163 rouge-l-r 0.462364 dtype: float64
# Mean of every score column whose name contains '-r' (the recall columns).
cbow_rouge_df.loc[:, [name for name in cbow_rouge_df.columns if '-r' in name]].mean()
rouge-1-r 0.475737 rouge-2-r 0.175338 rouge-l-r 0.433820 dtype: float64
# Summarize every input text with the word-embedding centroid summarizer;
# the calls run through map_parallel with a tqdm progress bar.
embedding_summaries = maybe_summarize_texts(summarize_with_embeddings, input_texts)
/home/kuba/anaconda3/lib/python3.6/site-packages/scipy/spatial/distance.py:696: RuntimeWarning: overflow encountered in square uu = np.average(np.square(u), weights=w) /home/kuba/anaconda3/lib/python3.6/site-packages/scipy/spatial/distance.py:696: RuntimeWarning: overflow encountered in square uu = np.average(np.square(u), weights=w) /home/kuba/anaconda3/lib/python3.6/site-packages/scipy/spatial/distance.py:696: RuntimeWarning: overflow encountered in square uu = np.average(np.square(u), weights=w) /home/kuba/anaconda3/lib/python3.6/site-packages/scipy/spatial/distance.py:696: RuntimeWarning: overflow encountered in square uu = np.average(np.square(u), weights=w) /home/kuba/anaconda3/lib/python3.6/site-packages/scipy/spatial/distance.py:695: RuntimeWarning: overflow encountered in multiply uv = np.average(u * v, weights=w) /home/kuba/anaconda3/lib/python3.6/site-packages/scipy/spatial/distance.py:697: RuntimeWarning: overflow encountered in square vv = np.average(np.square(v), weights=w) /home/kuba/anaconda3/lib/python3.6/site-packages/scipy/spatial/distance.py:698: RuntimeWarning: invalid value encountered in float_scalars dist = 1.0 - uv / np.sqrt(uu * vv) 1%| | 112/11490 [00:00<00:11, 968.46it/s]/home/kuba/anaconda3/lib/python3.6/site-packages/scipy/spatial/distance.py:697: RuntimeWarning: overflow encountered in square vv = np.average(np.square(v), weights=w) /home/kuba/anaconda3/lib/python3.6/site-packages/scipy/spatial/distance.py:695: RuntimeWarning: overflow encountered in multiply uv = np.average(u * v, weights=w) /home/kuba/anaconda3/lib/python3.6/site-packages/scipy/spatial/distance.py:698: RuntimeWarning: invalid value encountered in float_scalars dist = 1.0 - uv / np.sqrt(uu * vv) 1%|▏ | 165/11490 [00:00<00:56, 199.52it/s]/home/kuba/anaconda3/lib/python3.6/site-packages/scipy/spatial/distance.py:697: RuntimeWarning: overflow encountered in square vv = np.average(np.square(v), weights=w) 
/home/kuba/anaconda3/lib/python3.6/site-packages/scipy/spatial/distance.py:695: RuntimeWarning: overflow encountered in multiply uv = np.average(u * v, weights=w) /home/kuba/anaconda3/lib/python3.6/site-packages/scipy/spatial/distance.py:698: RuntimeWarning: invalid value encountered in float_scalars dist = 1.0 - uv / np.sqrt(uu * vv) 10%|█ | 1165/11490 [00:19<02:55, 58.77it/s]/home/kuba/anaconda3/lib/python3.6/site-packages/scipy/spatial/distance.py:697: RuntimeWarning: overflow encountered in square vv = np.average(np.square(v), weights=w) /home/kuba/anaconda3/lib/python3.6/site-packages/scipy/spatial/distance.py:695: RuntimeWarning: overflow encountered in multiply uv = np.average(u * v, weights=w) /home/kuba/anaconda3/lib/python3.6/site-packages/scipy/spatial/distance.py:698: RuntimeWarning: invalid value encountered in float_scalars dist = 1.0 - uv / np.sqrt(uu * vv) 57%|█████▋ | 6539/11490 [02:12<01:40, 49.27it/s]/home/kuba/anaconda3/lib/python3.6/site-packages/scipy/spatial/distance.py:698: RuntimeWarning: overflow encountered in float_scalars dist = 1.0 - uv / np.sqrt(uu * vv) 87%|████████▋ | 10009/11490 [03:40<00:32, 45.49it/s]/home/kuba/anaconda3/lib/python3.6/site-packages/scipy/spatial/distance.py:698: RuntimeWarning: overflow encountered in float_scalars dist = 1.0 - uv / np.sqrt(uu * vv) 100%|██████████| 11490/11490 [04:12<00:00, 45.44it/s]
# Score the embedding-based summaries against the references, then show
# the mean of every score column whose name contains '-r' (recall).
embedding_scores_df = get_rouge_df(embedding_summaries, reference_summaries)
embedding_scores_df.loc[:, [name for name in embedding_scores_df.columns if '-r' in name]].mean()
rouge-1-r 0.469270 rouge-2-r 0.157695 rouge-l-r 0.425080 dtype: float64