%%html
<style>
div.input {
display:none;
}
</style>
import nltk
# Read the whole input document into memory.
with open('result.txt', 'r') as f:
    sample = f.read()

# NLP pipeline: raw text -> sentences -> word tokens -> POS tags -> NE chunks.
sentences = nltk.sent_tokenize(sample)
tokenized_sentences = list(map(nltk.word_tokenize, sentences))
tagged_sentences = list(map(nltk.pos_tag, tokenized_sentences))
# binary=True asks the chunker for a single generic 'NE' label, which is the
# label extract_entity_names() looks for.
chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
def extract_entity_names(t):
    """Collect the named-entity strings contained in a chunk tree.

    The tree is walked recursively. A node whose ``label()`` equals ``'NE'``
    contributes one string: the first item of each of its children joined by
    single spaces. Any other labelled node is searched child by child.
    Objects without a ``label`` attribute (the leaves) contribute nothing.

    Args:
        t: A node exposing ``label()`` and iteration over its children
            (presumably an ``nltk.Tree`` produced by ``ne_chunk_sents``),
            or a leaf object.

    Returns:
        A list of entity-name strings in traversal order, duplicates kept.
    """
    # Leaves carry no label, so there is nothing to collect from them.
    # (The former extra ``and t.label`` test was always true: it checked the
    # bound method itself rather than its result.)
    if not hasattr(t, 'label'):
        return []

    # An NE node is an entity: stitch the words of its children together.
    if t.label() == 'NE':
        return [' '.join(child[0] for child in t)]

    # Any other subtree: gather the entities found below it.
    entity_names = []
    for child in t:
        entity_names.extend(extract_entity_names(child))
    return entity_names
# Flatten the entity names of every sentence tree into one list, in document
# order. Repeated mentions are kept.
entity_names = []
for tree in chunked_sentences:
    entity_names.extend(extract_entity_names(tree))

# Print all extracted names (duplicates are NOT removed here).
print(entity_names)
['John C. Malone', 'Columbus Holding', 'Mr. Malone', 'Mr. Malone', 'Mr. Malone', 'Mr. Malone', 'Berkshire Hathaway Inc.', 'SEC', 'Berkshire Hathaway', 'Cox', 'Cox International Stock Fund', 'SEC', 'Cox', 'Harris Associates L.P.', 'SEC', 'Harris', 'Harris Associates', 'Liberty Global', 'Liberty Global', 'SEC', 'Robert R. Bennett', 'Mr. Bennett', 'William H. Gates Ill', 'SEC', 'William H. Gates Ill', 'Bill', 'Melinda Gates Foundation Trust', 'Melinda French Gates', 'SEC', 'LLC', 'ROIC', 'LLC', 'William', 'Jason', 'Concert', 'Liberty Global', 'Sunrise', 'Sunrise', 'Annual Report', 'Liberty Global', 'Offeror', 'Liberty Global Group', 'Liberty Global', 'Sunrise As of August', 'Sunrise', 'Sunrise Shares']
# Write one entity name per line. The with-block guarantees the file is
# flushed and closed even if a write fails part-way through.
with open("names-extracted.txt", "w") as file:
    for element in entity_names:
        print(element, file=file)
import os
# Sort the extracted names and drop duplicate lines with a shell pipeline
# (cat | sort | uniq); the result is written to names-sorted-uniq.txt.
# NOTE(review): this needs a POSIX shell with cat/sort/uniq on PATH, and the
# resulting order presumably follows the shell's locale collation — confirm
# before replacing it with a pure-Python sort.
# The exit status returned by os.system is not checked.
os.system('cat names-extracted.txt | sort | uniq > names-sorted-uniq.txt')
0
# Read the sorted, de-duplicated names back in; the with-block closes the
# file as soon as its contents have been loaded.
with open("names-sorted-uniq.txt") as a_file:
    lines = a_file.readlines()

# Each entry from readlines() still ends with its newline, so print() emits
# a blank line after every name.
for line in lines:
    print(line)
Annual Report Berkshire Hathaway Berkshire Hathaway Inc. Bill Columbus Holding Concert Cox Cox International Stock Fund Harris Harris Associates Harris Associates L.P. Jason John C. Malone Liberty Global Liberty Global Group LLC Melinda French Gates Melinda Gates Foundation Trust Mr. Bennett Mr. Malone Offeror Robert R. Bennett ROIC SEC Sunrise Sunrise As of August Sunrise Shares William William H. Gates Ill