import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator
import glob
from collections import Counter
import re
import jieba
import pdftotext
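# note: jieba and pdftotext are third-party packages; pdftotext wraps the
# poppler library, which must be installed separately before pip-installing it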
pdf_files = glob.glob('pdf/*')
word_occ = dict()  # maps each word to its total occurrence count across exams
num_unique_words = np.zeros(len(pdf_files)+1)  # index 0 is the pre-exam baseline
wordlengths = np.zeros(len(pdf_files))  # total segment count per exam
for i, pdf_file in enumerate(pdf_files, start=1):
    # read all text from the PDF file
    with open(pdf_file, "rb") as f:
        text = '\n\n'.join(pdftotext.PDF(f))
    # strip out all space characters
    text = text.replace(' ', '')
    # segment the text into words
    seg_list = jieba.lcut(text, cut_all=False)
    # keep only segments containing at least one Chinese character
    ch_only_seg_list = [seg for seg in seg_list
                        if re.search('[\u4e00-\u9FFF]', seg)]
    # record the total segment count (including non-Chinese segments) of this exam
    wordlengths[i-1] = len(seg_list)
    # count occurrences of each word on this exam
    pdf_word_occ = Counter(ch_only_seg_list)
    # fold this exam's occurrences into the running totals
    for word, occ in pdf_word_occ.items():
        if word in word_occ:
            # the word appeared on an earlier exam
            word_occ[word] += occ
        else:
            # the word hasn't been seen before
            word_occ[word] = occ
            num_unique_words[i] += 1
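# Aside: the manual merge above could also be written with Counter's own set
# and update operations; a minimal sketch (not used in this analysis), where
# per_exam_counts is a hypothetical list holding each exam's Counter:
#
#     total = Counter()
#     for counts in per_exam_counts:
#         new = counts.keys() - total.keys()  # words unseen on earlier exams
#         total.update(counts)                # merge occurrence counts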
# cumulative sum of unique words
total_unique_words = np.cumsum(num_unique_words)
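# e.g. per-exam new-word counts [0, 3000, 1200, 900, ...] accumulate to
# [0, 3000, 4200, 5100, ...] (illustrative figures)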
# fraction of words on each exam that were new
frac_new = np.zeros(len(pdf_files)+1)
frac_new[0] = 100  # plotting convenience: before the first exam, every word is new
frac_new[1:] = num_unique_words[1:]/wordlengths*100
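# e.g. if the first exam introduced 3,000 previously unseen words among its
# 11,769 segments, frac_new[1] = 3000/11769*100 ≈ 25.5 (illustrative figures)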
print(wordlengths)
print(np.mean(wordlengths))
print(np.sum(wordlengths))
# output:
# [11769. 11998. 11778. 11847. 6465. 11813. 11817. 11621. 7528. 11808.
#  12080. 11956. 11712. 7470. 11890. 11841. 11902. 6477. 7552. 11931. 7420.]
# 10508.333333333334
# 220675.0
# plot cumulative unique words and per-exam new-word fraction on twin axes
fig, ax = plt.subplots(figsize=(6, 8))
ax.plot(np.arange(len(total_unique_words)), total_unique_words, marker='o')
ax2 = ax.twinx()  # second y-axis sharing the same x-axis
ax2.plot(np.arange(len(total_unique_words)), frac_new, color='g')
ax.tick_params(axis='both', labelsize=12)
ax.yaxis.set_minor_locator(AutoMinorLocator(5))
ax.grid(which='both')
ax.set_title('counting unique words in HSK level 6 practice exams', fontsize=14)
ax.set_xlabel('number of exams', fontsize=14)
ax.set_ylabel('cumulative unique words', fontsize=14, color='b')
ax.set_ylim([0, 25000])
ax2.set_ylabel('unique words per exam [%]', fontsize=14, color='g')
ax2.set_ylim([0, 50])
ax.text(7, 1250, 'github.com/JackElsey/hsk6-vocab-analysis')
plt.savefig('wordcount_plot.png', bbox_inches='tight')
# sort the words by occurrence count, most frequent first
sorted_word_occ = sorted(word_occ.items(), key=lambda item: item[1], reverse=True)
# output text file of words and occurrences
with open('wordlist.csv', 'w', encoding='utf-8') as outfile:
    for word, occ in sorted_word_occ:
        outfile.write(word + ',' + str(occ) + '\n')
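# If any segment could ever contain a comma, the csv module handles the
# quoting; an equivalent sketch using sorted_word_occ as the list of
# (word, occurrence) pairs built above:
#
#     import csv
#     with open('wordlist.csv', 'w', newline='', encoding='utf-8') as outfile:
#         csv.writer(outfile).writerows(sorted_word_occ)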