import urllib2
# Download the "taajuussanasto" word list and keep it as a list of text lines.
response = urllib2.urlopen('http://www.csc.fi/english/research/sciences/linguistics/taajuussanasto-B9996/download')
try:
    words = response.read()
finally:
    # Release the HTTP connection even when the read fails.
    response.close()
# The payload is decoded as UTF-8 and split into one entry per line.
words = words.decode('utf-8').splitlines()
# Notebook-style inspection of a few lines (bare expressions have no effect
# when this runs as a plain script).
words[:10]
words[10]
# A single parenthesised argument prints identically under Python 2.
print(words[10])
def rank(w):
    """Return the integer held in the first 8 characters of line *w*."""
    return int(w[:8])


rank(words[10])


def abs_count(w):
    """Return the integer held in characters 8 to 14 of line *w*."""
    return int(w[8:15])


abs_count(words[10])


def rel_count(w):
    """Return characters 15 to 24 of line *w* as a float.

    A decimal comma is converted to a decimal point before parsing.
    """
    return float(w[15:25].replace(',', '.'))


rel_count(words[10])


def the_word(w):
    """Return the text of line *w* from character 25 up to the first '('.

    Nothing is stripped, so whitespace that precedes the '(' stays in the
    returned string.
    """
    return w[25:].split('(')[0]


print(the_word(words[10]))

# Map each word to its (rank, absolute count, relative count) tuple.
# The first 3 and last 6 lines are skipped -- presumably header and footer
# text of the downloaded list; verify against the file.
word_dict = {
    the_word(w): (rank(w), abs_count(w), rel_count(w))
    for w in words[3:-6]
}
from IPython.html.widgets import interact
from IPython.display import HTML, display
def show_word(n):
    """Display the n-th word of ``word_dict`` with its frequency statistics.

    Args:
        n: Position of the word in the key order of ``word_dict``.

    The word and a two-column table (label, value) are assembled as HTML and
    shown with ``display(HTML(...))``. Nothing is returned.

    Raises:
        IndexError: If ``n`` is outside the range of available keys.
    """
    # list(...) gives positional access to the keys on Python 2 and 3 alike.
    word = list(word_dict)[n]
    # word_dict stores (rank, absolute count, relative count); the labels
    # must follow that order so each value sits next to its own name.
    labels = ('rank', 'absolute count', 'relative count')
    # NOTE(review): the HTML tags of these literals were missing from the
    # reviewed copy of the file (the strings were unterminated); the heading
    # and table markup below is a reconstruction -- confirm.
    parts = ['<h3>Word: %s</h3><table>\n' % word]
    for k, v in zip(labels, word_dict[word]):
        parts.append('<tr><td>{0}</td><td>{1}</td></tr>\n'.format(k, v))
    parts.append('</table>')
    display(HTML(''.join(parts)))
# Render one entry directly, then offer every valid key position through
# an interactive control.
show_word(3)
last_index = len(word_dict) - 1
interact(show_word, n=(0, last_index))
from IPython.display import IFrame
# Embed the fincd.com search page for a sample ASCII query.
IFrame('http://www.fincd.com/index.php?txtSearch=tunti&lang=fi', width='100%', height=350)
# Step through the encoding of a non-ASCII query: ISO-8859-1 bytes first,
# then percent-quoting, then the complete search URL.
my_word = u'kyllä'
encoded_word = my_word.encode('iso-8859-1')
encoded_word
urllib2.quote(encoded_word)
print('http://www.fincd.com/index.php?txtSearch=%s&lang=fi' % urllib2.quote(encoded_word))
def show_word_and_translation(n):
    """Display the n-th word's statistics and the fincd.com search page for it.

    Args:
        n: Position of the word in the key order of ``word_dict``.

    A table of (label, value) rows is shown first, followed by an ``IFrame``
    pointing at the fincd.com search URL for the word. Nothing is returned.

    Raises:
        IndexError: If ``n`` is outside the range of available keys.
    """
    # list(...) gives positional access to the keys on Python 2 and 3 alike.
    word = list(word_dict)[n]
    # word_dict stores (rank, absolute count, relative count); the labels
    # must follow that order so each value sits next to its own name.
    labels = ('rank', 'absolute count', 'relative count')
    # NOTE(review): the HTML tags of these literals were missing from the
    # reviewed copy of the file (the strings were unterminated); the table
    # markup below is a reconstruction -- confirm.
    parts = ['Word: %s<table>\n' % word]
    for k, v in zip(labels, word_dict[word]):
        parts.append('<tr><td>{0}</td><td>{1}</td></tr>\n'.format(k, v))
    parts.append('</table>')
    display(HTML(''.join(parts)))
    # The last character of the key is dropped before querying -- presumably
    # the space that precedes '(' in the word list; verify against the_word().
    query = urllib2.quote(word[:-1].encode('iso-8859-1'))
    url = 'http://www.fincd.com/index.php?txtSearch=%s&lang=fi' % query
    display(IFrame(url, width='100%', height=350))
# Offer every valid key position through an interactive control.
last_index = len(word_dict) - 1
interact(show_word_and_translation, n=(0, last_index))
# Fetch one fincd.com entry page; "%E4" in the URL is a percent-encoded byte.
response = urllib2.urlopen('http://www.fincd.com/finnish/kyll%E4.html')
source = response.read()
# The page bytes are decoded as ISO-8859-1 and split into lines for inspection.
source_split = source.decode("iso-8859-1").splitlines()
# Peek at a window of lines (bare expression; no effect in a plain script).
source_split[85:120]
# NOTE(review): both lookups search for an empty line, so they return the
# same index; the marker literals appear to have been lost from this copy --
# confirm the intended strings.
source_split.index(u'')
source_split.index(u'')
# Render lines 90 to 117 joined without separators, then keep the same text
# in `src` for the regex search that follows.
HTML("".join(source_split[90:118]))
src = "".join(source_split[90:118])
import re
# NOTE(review): an empty pattern matches at every position of `src`; the
# pattern literal appears to have been lost from this copy -- confirm.
p = re.compile('')
iterator = p.finditer(src)
# Print the (start, end) span of every match.
# NOTE(review): the print line is the loop body but carries no indentation
# in this copy of the file.
for match in iterator:
print match.span()
# Render a hard-coded character slice of `src` -- presumably picked from the
# spans printed above; verify.
HTML(src[53:367])
def extract_word_definition(source, start_marker=u'', stop_marker=u'', pattern=''):
    """Cut a fragment out of an ISO-8859-1 encoded page.

    The page is split into lines; the lines from the first one equal to
    ``start_marker`` up to (not including) the first following one equal to
    ``stop_marker`` are joined without separators. Within that text the
    fragment running from the first match of ``pattern`` to the start of the
    third match is returned.

    Args:
        source: Page content as bytes, decoded here as ISO-8859-1.
        start_marker: Exact line that opens the region of interest.
        stop_marker: Exact line that closes the region of interest.
        pattern: Regular expression whose first and third matches delimit
            the returned fragment.

    NOTE(review): the defaults repeat the literals of the reviewed copy of
    this file, where all three are empty; they look like lost markup and
    are unlikely to select anything useful -- pass real markers.

    Returns:
        The selected fragment as text.

    Raises:
        ValueError: If a marker line is not present.
        IndexError: If ``pattern`` matches fewer than three times.
    """
    lines = source.decode("iso-8859-1").splitlines()
    start = lines.index(start_marker)
    # Search from `start` so that an earlier occurrence of the stop marker
    # cannot produce an empty region; identical markers still resolve to the
    # same index as before.
    stop = lines.index(stop_marker, start)
    src = "".join(lines[start:stop])
    offsets = [match.start() for match in re.finditer(pattern, src)]
    return src[offsets[0]:offsets[2]]
# Render the fragment extracted from the page bytes held in `source`.
HTML(extract_word_definition(source))
def show_word_and_translation_html_only(n):
    """Display the n-th word's statistics followed by its extracted definition.

    Args:
        n: Position of the word in the key order of ``word_dict``.

    The fincd.com search page for the word is downloaded, the fragment
    selected by ``extract_word_definition`` is appended after the statistics
    rows, and the result is shown with ``display(HTML(...))``. Nothing is
    returned.

    Raises:
        IndexError: If ``n`` is outside the range of available keys.
    """
    # list(...) gives positional access to the keys on Python 2 and 3 alike.
    word = list(word_dict)[n]
    # word_dict stores (rank, absolute count, relative count); the labels
    # must follow that order so each value sits next to its own name.
    labels = ('rank', 'absolute count', 'relative count')
    # NOTE(review): the HTML tags of these literals were missing from the
    # reviewed copy of the file (the strings were unterminated); the table
    # markup below is a reconstruction -- confirm.
    parts = ['Word: %s<table>\n' % word]
    for k, v in zip(labels, word_dict[word]):
        parts.append('<tr><td>{0}</td><td>{1}</td></tr>\n'.format(k, v))
    # The last character of the key is dropped before querying -- presumably
    # the space that precedes '(' in the word list; verify against the_word().
    query = urllib2.quote(word[:-1].encode('iso-8859-1'))
    url = 'http://www.fincd.com/index.php?txtSearch=%s&lang=fi' % query
    response = urllib2.urlopen(url)
    try:
        page = response.read()
    finally:
        # Release the HTTP connection even when the read fails.
        response.close()
    parts.append(extract_word_definition(page))
    parts.append('</table>')
    display(HTML(''.join(parts)))
# Offer every valid key position through an interactive control.
last_index = len(word_dict) - 1
interact(show_word_and_translation_html_only, n=(0, last_index))
# word_dict is keyed by the strings returned by the_word(), so the integer
# lookup word_dict[0] could only raise KeyError and stop a script run; it is
# dropped. Positional access goes through a key list ordered by rank (the
# first element of each value tuple) instead.
sorted_keys = sorted(word_dict, key=lambda word: word_dict[word][0])
sorted_keys[:10]
def show_word_and_translation_html_only_sorted(n):
    """Display the n-th word by rank with its statistics and definition.

    Args:
        n: Position of the word in ``sorted_keys``.

    The fincd.com search page for the word is downloaded, the fragment
    selected by ``extract_word_definition`` is appended after the statistics
    rows, and the result is shown with ``display(HTML(...))``. Nothing is
    returned.

    Raises:
        IndexError: If ``n`` is outside the range of ``sorted_keys``.
    """
    word = sorted_keys[n]
    # word_dict stores (rank, absolute count, relative count); the labels
    # must follow that order so each value sits next to its own name.
    labels = ('rank', 'absolute count', 'relative count')
    # NOTE(review): the HTML tags of these literals were missing from the
    # reviewed copy of the file (the strings were unterminated); the table
    # markup below is a reconstruction -- confirm.
    parts = ['Word: %s<table>\n' % word]
    for k, v in zip(labels, word_dict[word]):
        parts.append('<tr><td>{0}</td><td>{1}</td></tr>\n'.format(k, v))
    # The last character of the key is dropped before querying -- presumably
    # the space that precedes '(' in the word list; verify against the_word().
    query = urllib2.quote(word[:-1].encode('iso-8859-1'))
    url = 'http://www.fincd.com/index.php?txtSearch=%s&lang=fi' % query
    response = urllib2.urlopen(url)
    try:
        page = response.read()
    finally:
        # Release the HTTP connection even when the read fails.
        response.close()
    parts.append(extract_word_definition(page))
    parts.append('</table>')
    display(HTML(''.join(parts)))
# Browse the rank-ordered words: first across every position, then across
# positions 0 to 200 only.
full_range = (0, len(word_dict) - 1)
interact(show_word_and_translation_html_only_sorted, n=full_range)
top_range = (0, 200)
interact(show_word_and_translation_html_only_sorted, n=top_range)