# -*- coding: utf-8 -*-
"""Explore a Finnish word-frequency list and look up translations on fincd.com.

This file is a notebook-style script: it downloads a fixed-width frequency
list, parses it into ``word_dict`` and offers IPython widgets that show the
statistics of a word together with its fincd.com translation.

NOTE(review): the copy of the notebook this was restored from had every HTML
tag stripped out of its string literals.  The table markup built in
``_stats_html`` is a reconstruction, and the three ``DEFINITION_*`` constants
below could not be reconstructed at all -- restore them from the original
notebook before using the scraping helpers.
"""

import re

try:
    import urllib2  # Python 2, as used by the original notebook
except ImportError:  # Python 3: the same two functions live elsewhere
    from urllib.parse import quote as _quote
    from urllib.request import urlopen as _urlopen
else:
    _quote = urllib2.quote
    _urlopen = urllib2.urlopen

FREQUENCY_LIST_URL = ('http://www.csc.fi/english/research/sciences/'
                      'linguistics/taajuussanasto-B9996/download')
TRANSLATION_URL_TEMPLATE = 'http://www.fincd.com/index.php?txtSearch=%s&lang=fi'

# Labels for the tuples stored in ``word_dict``.  The order must follow the
# tuple layout (rank, abs_count, rel_count); the original listed the two
# counts the other way round and therefore mislabelled them.
STAT_LABELS = ('rank', 'absolute count', 'relative count')

# TODO(review): these literals were lost (see module docstring).
# START/STOP are whole lines of the fincd.com page that delimit the region
# holding the definition; BREAK_PATTERN is a regex whose first and third
# matches delimit the definition inside that region.
DEFINITION_START_MARKER = None
DEFINITION_STOP_MARKER = None
DEFINITION_BREAK_PATTERN = None

# Filled in by the script section at the bottom of the file.
word_dict = {}    # word -> (rank, absolute count, relative count)
sorted_keys = []  # keys of word_dict ordered by rank


def rank(w):
    """Return the rank stored in columns 0-7 of frequency-list line *w*."""
    return int(w[:8])


def abs_count(w):
    """Return the absolute count stored in columns 8-14 of line *w*."""
    return int(w[8:15])


def rel_count(w):
    """Return the relative count stored in columns 15-24 of line *w*.

    The field uses a decimal comma, which is converted before parsing.
    """
    return float(w[15:25].replace(',', '.'))


def the_word(w):
    """Return the text of line *w* from column 25 up to the first ``(``.

    Whitespace in front of the parenthesis is kept, so the result may end
    with a space; ``_translation_url`` strips it before querying.
    """
    return w[25:].split('(')[0]


def _fetch(url):
    """Return the raw bytes served at *url*, always closing the response."""
    response = _urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def _translation_url(word):
    """Return the fincd.com search URL for *word*.

    The word is stripped, encoded as ISO-8859-1 and percent-quoted, so
    ``u'kyllä'`` becomes ``...txtSearch=kyll%E4&lang=fi``.  The original
    dropped the last character with ``word[:-1]``, which only works when
    the key ends with exactly one padding character.
    """
    return TRANSLATION_URL_TEMPLATE % _quote(word.strip().encode('iso-8859-1'))


def _stats_html(word, stats):
    """Return an HTML fragment with a heading for *word* and a stats table.

    *stats* is a ``(rank, abs_count, rel_count)`` tuple as stored in
    ``word_dict``.

    NOTE(review): the tags are reconstructed; only the visible text
    ``Word: %s`` and the ``{0}{1}`` cell placeholders survived.
    """
    rows = ''.join(
        '<tr><td>{0}</td><td>{1}</td></tr>\n'.format(label, value)
        for label, value in zip(STAT_LABELS, stats))
    return '<h3>Word: %s</h3>\n<table>\n' % word + rows + '</table>\n'


def _display_html(markup):
    """Render *markup* as rich HTML output in the running IPython frontend."""
    # Imported lazily so the parsing helpers stay usable without IPython.
    from IPython.display import HTML, display
    display(HTML(markup))


def extract_word_definition(source, start_marker=None, stop_marker=None,
                            break_pattern=None):
    """Cut the definition fragment out of a fincd.com result page.

    Parameters:
        source: raw page bytes, decoded here as ISO-8859-1.
        start_marker, stop_marker: exact lines of the page; the lines from
            the start marker (inclusive) to the stop marker (exclusive) are
            joined without separators.
        break_pattern: regular expression searched in the joined text.
        Each argument defaults to the matching ``DEFINITION_*`` constant.

    Returns:
        The joined text from the start of the first ``break_pattern`` match
        up to the start of the third match.

    Raises:
        ValueError: if a marker/pattern is not configured, a marker line is
            not present, or the pattern matches fewer than three times.
    """
    if start_marker is None:
        start_marker = DEFINITION_START_MARKER
    if stop_marker is None:
        stop_marker = DEFINITION_STOP_MARKER
    if break_pattern is None:
        break_pattern = DEFINITION_BREAK_PATTERN
    if None in (start_marker, stop_marker, break_pattern):
        raise ValueError(
            'definition markers are not configured: set the DEFINITION_* '
            'constants or pass start_marker/stop_marker/break_pattern')

    lines = source.decode("iso-8859-1").splitlines()
    first = lines.index(start_marker)  # ValueError if the line is missing
    last = lines.index(stop_marker)
    region = "".join(lines[first:last])

    offsets = [match.start() for match in re.finditer(break_pattern, region)]
    if len(offsets) < 3:
        raise ValueError(
            'expected at least 3 matches of %r in the definition region, '
            'found %d' % (break_pattern, len(offsets)))
    return region[offsets[0]:offsets[2]]


def show_word(n):
    """Display the statistics table of the *n*-th key of ``word_dict``."""
    word = list(word_dict)[n]
    _display_html(_stats_html(word, word_dict[word]))


def show_word_and_translation(n):
    """Display the *n*-th word's statistics and its fincd.com page in a frame."""
    from IPython.display import IFrame, display
    word = list(word_dict)[n]
    _display_html(_stats_html(word, word_dict[word]))
    display(IFrame(_translation_url(word), width='100%', height=350))


def _show_with_definition(word):
    """Display the statistics of *word* followed by its scraped definition."""
    # The reconstructed table is closed before the definition is appended so
    # that the combined fragment stays well-formed HTML.
    definition = extract_word_definition(_fetch(_translation_url(word)))
    _display_html(_stats_html(word, word_dict[word]) + definition)


def show_word_and_translation_html_only(n):
    """Display statistics and scraped definition of the *n*-th dict key."""
    _show_with_definition(list(word_dict)[n])


def show_word_and_translation_html_only_sorted(n):
    """Display statistics and scraped definition of ``sorted_keys[n]``."""
    _show_with_definition(sorted_keys[n])


if __name__ == "__main__":
    # Everything below talks to the network and to an IPython frontend, so it
    # only runs when the file is executed (script or notebook cell), not when
    # the helpers above are imported.  Only single-argument print() calls are
    # used because they behave identically on Python 2 and 3.
    from IPython.display import IFrame, display
    # NOTE(review): newer IPython releases ship the widgets as the separate
    # ``ipywidgets`` package; the original import path is kept here.
    from IPython.html.widgets import interact

    words = _fetch(FREQUENCY_LIST_URL).decode('utf-8').splitlines()
    print(words[:10])
    print(words[10])
    print(rank(words[10]))
    print(abs_count(words[10]))
    print(rel_count(words[10]))
    print(the_word(words[10]))

    # The first 3 and last 6 lines are skipped -- presumably header and
    # footer lines of the download; confirm against the file.
    word_dict = {
        the_word(w): (rank(w), abs_count(w), rel_count(w))
        for w in words[3:-6]
    }

    show_word(3)
    interact(show_word, n=(0, len(word_dict) - 1))

    display(IFrame('http://www.fincd.com/index.php?txtSearch=tunti&lang=fi',
                   width='100%', height=350))

    my_word = u'kyllä'
    print(_translation_url(my_word))

    interact(show_word_and_translation, n=(0, len(word_dict) - 1))

    source = _fetch('http://www.fincd.com/finnish/kyll%E4.html')
    source_split = source.decode("iso-8859-1").splitlines()
    print(source_split[85:120])
    # Hard-coded line range from the original exploration of this one page;
    # extract_word_definition looks the range up by marker instead.
    _display_html("".join(source_split[90:118]))

    # Raises ValueError until the DEFINITION_* constants are restored.
    _display_html(extract_word_definition(source))

    interact(show_word_and_translation_html_only, n=(0, len(word_dict) - 1))

    # word_dict is keyed by word, not by rank (word_dict[0] is a KeyError),
    # so a rank-ordered key list is needed to browse by frequency.
    sorted_keys = sorted(word_dict, key=lambda word: word_dict[word][0])
    print(sorted_keys[:10])

    interact(show_word_and_translation_html_only_sorted,
             n=(0, len(word_dict) - 1))
    interact(show_word_and_translation_html_only_sorted, n=(0, 200))