%load_ext autoreload
%autoreload 2
import cytoolz as tlz
from text2math import raw2text as r2t
from text2math import text2tokens as t2t
from text2math import tokens2numbers as t2n
import text2math as txt2m
get_text_from_xml_file
extracts the Title and Body for each post.
# Stream the text of the posts (Title and Body) from the Stack Overflow XML dump.
TXT_STREAM = r2t.get_text_from_xml_file("../data/stackoverflow/Posts.xml")
# Take just the first post as the running example. The builtin next() is used
# instead of the Python-2-only .next() method; it behaves the same here.
TXT = next(TXT_STREAM)
print(TXT)
How do I build a GCC 4.7 toolchain for cross-compiling? <p>I already asked this <a href="http://stackoverflow.com/questions/10973020/cross-compilation-for-raspberry-pi-in-gcc-where-to-start">question</a> on Stack Overflow, but I would like to know if anyone managed to build a GCC 4.7 toolchain for ARM cross-compilation (for a x86/x86-64 Linux host). There are many instructins for building GCC from source and many available cross-compilers for pre-4.7 GCC versions, just not the latest one.</p> <p>Compiling on Rasp Pi itself works fine but is just a bit too slow for practical purposes.</p> <p>I am eager to get compiling and I would like to use the latest and the best tools.</p>
remove_html_bits
uses BeautifulSoup and lxml to remove the HTML tags.
# Strip the HTML markup from the raw post text.
NO_HTML = r2t.remove_html_bits(TXT)
# Print the stored result directly; NO_HTML is already cleaned, so running
# remove_html_bits over it a second time is unnecessary work.
print(NO_HTML)
How do I build a GCC 4.7 toolchain for cross-compiling? I already asked this question on Stack Overflow, but I would like to know if anyone managed to build a GCC 4.7 toolchain for ARM cross-compilation (for a x86/x86-64 Linux host). There are many instructins for building GCC from source and many available cross-compilers for pre-4.7 GCC versions, just not the latest one. Compiling on Rasp Pi itself works fine but is just a bit too slow for practical purposes. I am eager to get compiling and I would like to use the latest and the best tools.
# Run the one-shot decode-and-fix helper on the HTML-free text. For this post
# the printed text is the same as before, since it contains nothing to repair.
UNICODE = r2t.decode_and_fix(NO_HTML)
print(UNICODE)
How do I build a GCC 4.7 toolchain for cross-compiling? I already asked this question on Stack Overflow, but I would like to know if anyone managed to build a GCC 4.7 toolchain for ARM cross-compilation (for a x86/x86-64 Linux host). There are many instructins for building GCC from source and many available cross-compilers for pre-4.7 GCC versions, just not the latest one. Compiling on Rasp Pi itself works fine but is just a bit too slow for practical purposes. I am eager to get compiling and I would like to use the latest and the best tools.
Here's a better example of fixing text encoding, using a byte string that contains a non-ASCII character:
# A byte string containing a non-ASCII character, used as the test input below.
MESSED_UP_TEXT = b'ünicode'
print(MESSED_UP_TEXT)
ünicode
# One-shot fix of the byte string; the echoed result is plain, accent-free unicode text.
r2t.decode_and_fix(MESSED_UP_TEXT)
u'unicode'
Broken down by component:
# First component: adv_decode (presumably bytes -> unicode, as the variable name
# suggests -- confirm against text2math.raw2text).
UNI_MESSED_UP_TEXT = r2t.adv_decode(MESSED_UP_TEXT)
print(UNI_MESSED_UP_TEXT)
ünicode
# Second component: clean_unicode. For this input the printed text looks unchanged.
CLEAN_UNI_MESSED_UP_TEXT = r2t.clean_unicode(UNI_MESSED_UP_TEXT)
print(CLEAN_UNI_MESSED_UP_TEXT)
ünicode
# Third component: normize_text. The printed result no longer carries the accent.
print(r2t.normize_text(CLEAN_UNI_MESSED_UP_TEXT))
unicode
# Full cleaning pipeline for the example post. tlz.pipe threads TXT through the
# listed functions in order: strip HTML, then the three text-fixing components
# demonstrated individually above.
CLEAN_TXT = tlz.pipe(TXT,
r2t.remove_html_bits,
r2t.adv_decode,
r2t.clean_unicode,
r2t.normize_text)
print(CLEAN_TXT)
How do I build a GCC 4.7 toolchain for cross-compiling? I already asked this question on Stack Overflow, but I would like to know if anyone managed to build a GCC 4.7 toolchain for ARM cross-compilation (for a x86/x86-64 Linux host). There are many instructins for building GCC from source and many available cross-compilers for pre-4.7 GCC versions, just not the latest one. Compiling on Rasp Pi itself works fine but is just a bit too slow for practical purposes. I am eager to get compiling and I would like to use the latest and the best tools.
The goal: the complete tokenizing pipeline below, which the following cells rebuild one step at a time.
@tlz.curry
def ngram_tuples(n, string, minlen=3, maxlen=25):
    """Turn a text into its n-grams, each joined into one "_"-separated token.

    The text is lower-cased, split into tokens, filtered by token length,
    split again into word pieces, stripped of stopwords, grouped into sliding
    windows of ``n`` consecutive tokens, and finally each window is joined
    with "_".

    Because the function is curried, it can be partially applied, e.g.
    ``ngram_tuples(2)`` gives a bigram maker awaiting the text.

    Args:
        n: Number of consecutive tokens per n-gram (the sliding-window width).
        string: The text to tokenize.
        minlen: Length limit passed to ``filter_shorter_than``.
        maxlen: Length limit passed to ``filter_longer_than``; applied before
            the tokens are split into word pieces.

    Returns:
        An iterable of "_"-joined n-gram strings (the output of ``map_c``;
        wrap it in ``tuple``/``list`` to materialize it).
    """
    return tlz.pipe(string,
                    lower,
                    simple_split,
                    filter_longer_than(maxlen),
                    # Split every token into word pieces, then flatten the
                    # per-token results into one stream. The splitter has to
                    # be mapped over the tokens (curried map_c); composing a
                    # bare ``map`` would hand the splitter the whole token
                    # sequence at once.
                    compose(concat, map_c(splitter_of_words)),
                    filter_shorter_than(minlen),
                    filter_stopwords,
                    sliding_window_c(n),
                    map_c(join_strings("_")))
lower
# Pipeline step: lower-case the cleaned text.
LOW_TXT = t2t.lower(CLEAN_TXT)
print(LOW_TXT)
how do i build a gcc 4.7 toolchain for cross-compiling? i already asked this question on stack overflow, but i would like to know if anyone managed to build a gcc 4.7 toolchain for arm cross-compilation (for a x86/x86-64 linux host). there are many instructins for building gcc from source and many available cross-compilers for pre-4.7 gcc versions, just not the latest one. compiling on rasp pi itself works fine but is just a bit too slow for practical purposes. i am eager to get compiling and i would like to use the latest and the best tools.
simple_split
# Pipeline step: split the text into tokens. As the output shows, punctuation
# stays attached to the neighbouring word at this stage.
SMPL_SPLIT_TXT = t2t.simple_split(LOW_TXT)
print(SMPL_SPLIT_TXT)
[u'how', u'do', u'i', u'build', u'a', u'gcc', u'4.7', u'toolchain', u'for', u'cross-compiling?', u'i', u'already', u'asked', u'this', u'question', u'on', u'stack', u'overflow,', u'but', u'i', u'would', u'like', u'to', u'know', u'if', u'anyone', u'managed', u'to', u'build', u'a', u'gcc', u'4.7', u'toolchain', u'for', u'arm', u'cross-compilation', u'(for', u'a', u'x86/x86-64', u'linux', u'host).', u'there', u'are', u'many', u'instructins', u'for', u'building', u'gcc', u'from', u'source', u'and', u'many', u'available', u'cross-compilers', u'for', u'pre-4.7', u'gcc', u'versions,', u'just', u'not', u'the', u'latest', u'one.', u'compiling', u'on', u'rasp', u'pi', u'itself', u'works', u'fine', u'but', u'is', u'just', u'a', u'bit', u'too', u'slow', u'for', u'practical', u'purposes.', u'i', u'am', u'eager', u'to', u'get', u'compiling', u'and', u'i', u'would', u'like', u'to', u'use', u'the', u'latest', u'and', u'the', u'best', u'tools.']
# Pipeline step: filter_longer_than with a limit of 25 (presumably drops tokens
# longer than 25 characters -- confirm; no token of this post is affected).
# list() materializes the result so it can be printed and reused.
LONG_FILTERED = list(t2t.filter_longer_than(25, SMPL_SPLIT_TXT))
print(LONG_FILTERED)
[u'how', u'do', u'i', u'build', u'a', u'gcc', u'4.7', u'toolchain', u'for', u'cross-compiling?', u'i', u'already', u'asked', u'this', u'question', u'on', u'stack', u'overflow,', u'but', u'i', u'would', u'like', u'to', u'know', u'if', u'anyone', u'managed', u'to', u'build', u'a', u'gcc', u'4.7', u'toolchain', u'for', u'arm', u'cross-compilation', u'(for', u'a', u'x86/x86-64', u'linux', u'host).', u'there', u'are', u'many', u'instructins', u'for', u'building', u'gcc', u'from', u'source', u'and', u'many', u'available', u'cross-compilers', u'for', u'pre-4.7', u'gcc', u'versions,', u'just', u'not', u'the', u'latest', u'one.', u'compiling', u'on', u'rasp', u'pi', u'itself', u'works', u'fine', u'but', u'is', u'just', u'a', u'bit', u'too', u'slow', u'for', u'practical', u'purposes.', u'i', u'am', u'eager', u'to', u'get', u'compiling', u'and', u'i', u'would', u'like', u'to', u'use', u'the', u'latest', u'and', u'the', u'best', u'tools.']
# Pipeline step: apply splitter_of_words to every token and flatten the pieces
# into a single list. In the output the tokens are broken at punctuation such as
# "-", ".", "/" and "?", which also leaves some empty strings behind.
NON_ALPHANUM_SPLIT = list(tlz.concat(tlz.map(t2t.splitter_of_words, LONG_FILTERED)))
print(NON_ALPHANUM_SPLIT)
[u'how', u'do', u'i', u'build', u'a', u'gcc', u'4', u'7', u'toolchain', u'for', u'cross', u'compiling', u'', u'i', u'already', u'asked', u'this', u'question', u'on', u'stack', u'overflow', u'', u'but', u'i', u'would', u'like', u'to', u'know', u'if', u'anyone', u'managed', u'to', u'build', u'a', u'gcc', u'4', u'7', u'toolchain', u'for', u'arm', u'cross', u'compilation', u'', u'for', u'a', u'x86', u'x86', u'64', u'linux', u'host', u'', u'there', u'are', u'many', u'instructins', u'for', u'building', u'gcc', u'from', u'source', u'and', u'many', u'available', u'cross', u'compilers', u'for', u'pre', u'4', u'7', u'gcc', u'versions', u'', u'just', u'not', u'the', u'latest', u'one', u'', u'compiling', u'on', u'rasp', u'pi', u'itself', u'works', u'fine', u'but', u'is', u'just', u'a', u'bit', u'too', u'slow', u'for', u'practical', u'purposes', u'', u'i', u'am', u'eager', u'to', u'get', u'compiling', u'and', u'i', u'would', u'like', u'to', u'use', u'the', u'latest', u'and', u'the', u'best', u'tools', u'']
# Pipeline step: filter_shorter_than with a limit of 3. In the output the empty
# strings and the one- and two-character tokens are gone.
SHORT_FILTERED = list(t2t.filter_shorter_than(3, NON_ALPHANUM_SPLIT))
print(SHORT_FILTERED)
[u'how', u'build', u'gcc', u'toolchain', u'for', u'cross', u'compiling', u'already', u'asked', u'this', u'question', u'stack', u'overflow', u'but', u'would', u'like', u'know', u'anyone', u'managed', u'build', u'gcc', u'toolchain', u'for', u'arm', u'cross', u'compilation', u'for', u'x86', u'x86', u'linux', u'host', u'there', u'are', u'many', u'instructins', u'for', u'building', u'gcc', u'from', u'source', u'and', u'many', u'available', u'cross', u'compilers', u'for', u'pre', u'gcc', u'versions', u'just', u'not', u'the', u'latest', u'one', u'compiling', u'rasp', u'itself', u'works', u'fine', u'but', u'just', u'bit', u'too', u'slow', u'for', u'practical', u'purposes', u'eager', u'get', u'compiling', u'and', u'would', u'like', u'use', u'the', u'latest', u'and', u'the', u'best', u'tools']
# Pipeline step: remove stopwords (e.g. "how", "for", "this" disappear in the output).
NON_STOP = list(t2t.filter_stopwords(SHORT_FILTERED))
print(NON_STOP)
[u'build', u'gcc', u'toolchain', u'cross', u'compiling', u'asked', u'question', u'stack', u'overflow', u'like', u'know', u'managed', u'build', u'gcc', u'toolchain', u'arm', u'cross', u'compilation', u'x86', u'x86', u'linux', u'host', u'instructins', u'building', u'gcc', u'source', u'available', u'cross', u'compilers', u'pre', u'gcc', u'versions', u'latest', u'compiling', u'rasp', u'works', u'fine', u'bit', u'slow', u'practical', u'purposes', u'eager', u'compiling', u'like', u'use', u'latest', u'best', u'tools']
# Pipeline step: sliding window of width 1 -- every token becomes a 1-tuple.
UNI_T = list(t2t.sliding_window_c(1, NON_STOP))
print(UNI_T)
[(u'build',), (u'gcc',), (u'toolchain',), (u'cross',), (u'compiling',), (u'asked',), (u'question',), (u'stack',), (u'overflow',), (u'like',), (u'know',), (u'managed',), (u'build',), (u'gcc',), (u'toolchain',), (u'arm',), (u'cross',), (u'compilation',), (u'x86',), (u'x86',), (u'linux',), (u'host',), (u'instructins',), (u'building',), (u'gcc',), (u'source',), (u'available',), (u'cross',), (u'compilers',), (u'pre',), (u'gcc',), (u'versions',), (u'latest',), (u'compiling',), (u'rasp',), (u'works',), (u'fine',), (u'bit',), (u'slow',), (u'practical',), (u'purposes',), (u'eager',), (u'compiling',), (u'like',), (u'use',), (u'latest',), (u'best',), (u'tools',)]
# Same step with width 2 -- overlapping pairs of adjacent tokens.
BI_T = list(t2t.sliding_window_c(2, NON_STOP))
print(BI_T)
[(u'build', u'gcc'), (u'gcc', u'toolchain'), (u'toolchain', u'cross'), (u'cross', u'compiling'), (u'compiling', u'asked'), (u'asked', u'question'), (u'question', u'stack'), (u'stack', u'overflow'), (u'overflow', u'like'), (u'like', u'know'), (u'know', u'managed'), (u'managed', u'build'), (u'build', u'gcc'), (u'gcc', u'toolchain'), (u'toolchain', u'arm'), (u'arm', u'cross'), (u'cross', u'compilation'), (u'compilation', u'x86'), (u'x86', u'x86'), (u'x86', u'linux'), (u'linux', u'host'), (u'host', u'instructins'), (u'instructins', u'building'), (u'building', u'gcc'), (u'gcc', u'source'), (u'source', u'available'), (u'available', u'cross'), (u'cross', u'compilers'), (u'compilers', u'pre'), (u'pre', u'gcc'), (u'gcc', u'versions'), (u'versions', u'latest'), (u'latest', u'compiling'), (u'compiling', u'rasp'), (u'rasp', u'works'), (u'works', u'fine'), (u'fine', u'bit'), (u'bit', u'slow'), (u'slow', u'practical'), (u'practical', u'purposes'), (u'purposes', u'eager'), (u'eager', u'compiling'), (u'compiling', u'like'), (u'like', u'use'), (u'use', u'latest'), (u'latest', u'best'), (u'best', u'tools')]
# Final pipeline step: join each 1-tuple into a single token using "_" as the
# separator. list() materializes the result, matching the list(...) wrapping of
# the earlier cells, so print shows the tokens even where map is lazy.
_UNIGRAMS = list(map(t2t.join_strings("_"), UNI_T))
print(_UNIGRAMS)
[u'build', u'gcc', u'toolchain', u'cross', u'compiling', u'asked', u'question', u'stack', u'overflow', u'like', u'know', u'managed', u'build', u'gcc', u'toolchain', u'arm', u'cross', u'compilation', u'x86', u'x86', u'linux', u'host', u'instructins', u'building', u'gcc', u'source', u'available', u'cross', u'compilers', u'pre', u'gcc', u'versions', u'latest', u'compiling', u'rasp', u'works', u'fine', u'bit', u'slow', u'practical', u'purposes', u'eager', u'compiling', u'like', u'use', u'latest', u'best', u'tools']
# Join each token pair into one "_"-separated bigram token. list() materializes
# the result, matching the list(...) wrapping of the earlier cells, so print
# shows the tokens even where map is lazy.
_BIGRAMS = list(map(t2t.join_strings("_"), BI_T))
print(_BIGRAMS)
[u'build_gcc', u'gcc_toolchain', u'toolchain_cross', u'cross_compiling', u'compiling_asked', u'asked_question', u'question_stack', u'stack_overflow', u'overflow_like', u'like_know', u'know_managed', u'managed_build', u'build_gcc', u'gcc_toolchain', u'toolchain_arm', u'arm_cross', u'cross_compilation', u'compilation_x86', u'x86_x86', u'x86_linux', u'linux_host', u'host_instructins', u'instructins_building', u'building_gcc', u'gcc_source', u'source_available', u'available_cross', u'cross_compilers', u'compilers_pre', u'pre_gcc', u'gcc_versions', u'versions_latest', u'latest_compiling', u'compiling_rasp', u'rasp_works', u'works_fine', u'fine_bit', u'bit_slow', u'slow_practical', u'practical_purposes', u'purposes_eager', u'eager_compiling', u'compiling_like', u'like_use', u'use_latest', u'latest_best', u'best_tools']
# The whole tokenizing pipeline in a single expression, here with a window of 3
# (trigrams). Each stage is the same step shown individually above; the curried
# helpers are called with their configuration only and receive the data from
# tlz.pipe. tuple() materializes the final result.
TRIGRAMS = tuple(tlz.pipe(CLEAN_TXT,
t2t.lower,
t2t.simple_split,
t2t.filter_longer_than(25), # curried
tlz.compose(tlz.concat, t2t.map_c(t2t.splitter_of_words)),
t2t.filter_shorter_than(3), # curried
t2t.filter_stopwords,
t2t.sliding_window_c(3), # curried
t2t.map_c(t2t.join_strings("_")))) # A lot of curry
print(TRIGRAMS)
(u'build_gcc_toolchain', u'gcc_toolchain_cross', u'toolchain_cross_compiling', u'cross_compiling_asked', u'compiling_asked_question', u'asked_question_stack', u'question_stack_overflow', u'stack_overflow_like', u'overflow_like_know', u'like_know_managed', u'know_managed_build', u'managed_build_gcc', u'build_gcc_toolchain', u'gcc_toolchain_arm', u'toolchain_arm_cross', u'arm_cross_compilation', u'cross_compilation_x86', u'compilation_x86_x86', u'x86_x86_linux', u'x86_linux_host', u'linux_host_instructins', u'host_instructins_building', u'instructins_building_gcc', u'building_gcc_source', u'gcc_source_available', u'source_available_cross', u'available_cross_compilers', u'cross_compilers_pre', u'compilers_pre_gcc', u'pre_gcc_versions', u'gcc_versions_latest', u'versions_latest_compiling', u'latest_compiling_rasp', u'compiling_rasp_works', u'rasp_works_fine', u'works_fine_bit', u'fine_bit_slow', u'bit_slow_practical', u'slow_practical_purposes', u'practical_purposes_eager', u'purposes_eager_compiling', u'eager_compiling_like', u'compiling_like_use', u'like_use_latest', u'use_latest_best', u'latest_best_tools')
# Count how often each unigram occurs in the cleaned post; the output is a list
# of (token, count) pairs.
print(t2n.freq(list(t2t.unigram(CLEAN_TXT))))
[(u'pre', 1), (u'managed', 1), (u'overflow', 1), (u'purposes', 1), (u'linux', 1), (u'tools', 1), (u'arm', 1), (u'best', 1), (u'gcc', 4), (u'slow', 1), (u'source', 1), (u'fine', 1), (u'question', 1), (u'cross', 3), (u'eager', 1), (u'compiling', 3), (u'build', 2), (u'rasp', 1), (u'compilers', 1), (u'available', 1), (u'x86', 2), (u'compilation', 1), (u'use', 1), (u'host', 1), (u'know', 1), (u'bit', 1), (u'stack', 1), (u'building', 1), (u'toolchain', 2), (u'like', 2), (u'versions', 1), (u'practical', 1), (u'instructins', 1), (u'works', 1), (u'asked', 1), (u'latest', 2)]
# Token maker for one raw post. compose applies its functions right to left:
# strip HTML, decode/fix the text, tokenize with uni_and_bigram_tuples, and
# materialize the tokens as a tuple.
tkn_maker = tlz.compose(tuple, t2t.uni_and_bigram_tuples, r2t.decode_and_fix, r2t.remove_html_bits)
# Read every post into a list up front so the timed runs below all work on the
# same in-memory input.
POST_STREAM = list(r2t.get_text_from_xml_file("../data/stackoverflow/Posts.xml"))
Serial Version
# Serial baseline: tokenize every post with tlz.map and total the token counts.
%time OUT0 = txt2m.total_counts(tlz.map(tkn_maker, POST_STREAM))
CPU times: user 1min 9s, sys: 583 ms, total: 1min 10s Wall time: 1min 10s
# The same serial computation, written with tlz.pipe instead of a nested call.
%time OUT0 = tlz.pipe(tlz.map(tkn_maker, POST_STREAM), txt2m.total_counts)
CPU times: user 1min 16s, sys: 1.16 s, total: 1min 17s Wall time: 1min 20s
Parallel Version
from multiprocessing import Pool
# Pool of 8 worker processes; pmap is the pool's map, used below as a drop-in
# replacement for tlz.map so the posts are tokenized in parallel.
p = Pool(8)
pmap = p.map
# Parallel run: only the tokenizing is distributed; total_counts runs in this process.
%time OUT1 = txt2m.total_counts(pmap(tkn_maker, POST_STREAM))
CPU times: user 2.86 s, sys: 680 ms, total: 3.54 s Wall time: 34.9 s
# The same parallel computation, written with tlz.pipe instead of a nested call.
%time OUT1 = tlz.pipe(pmap(tkn_maker, POST_STREAM), txt2m.total_counts)
CPU times: user 3.51 s, sys: 907 ms, total: 4.42 s Wall time: 36.8 s
# Number of entries in the totalled counts.
len(OUT1)
789541
# Sort the (token, count) pairs by count, highest first, and show ranks 251-300.
sorted(OUT1, key=lambda t: t[1], reverse=True)[250:300]
[(u'problems', 1066), (u'wrong', 1066), (u'enable', 1065), (u'lot', 1065), (u'powered', 1060), (u'won', 1056), (u'long', 1055), (u'download', 1035), (u'best', 1033), (u'info', 1021), (u'service', 1016), (u'similar', 1009), (u'idea', 1003), (u'interfaces', 1003), (u'sound', 997), (u'commands', 996), (u'return', 985), (u'gpio_pins', 984), (u'gui', 983), (u'function', 981), (u'media', 968), (u'include', 965), (u'reading', 964), (u'worked', 961), (u'ports', 961), (u'connecting', 951), (u'signal', 948), (u'configure', 948), (u'light', 947), (u'raspbmc', 946), (u'motion', 946), (u'debug', 938), (u'daemon', 936), (u'free', 935), (u'message', 935), (u'remove', 934), (u'specific', 932), (u'copy', 930), (u'little', 930), (u'post', 922), (u'isn', 919), (u'machine', 919), (u'format', 912), (u'order', 912), (u'standard', 911), (u'correct', 909), (u'load', 907), (u'maybe', 894), (u'fix', 889), (u'cards', 883)]
tkn_maker_uni = tlz.compose(tuple, t2t., r2t.decode_and_fix, r2t.remove_html_bits)
%time OUT2 = txt2m.total_counts(pmap(tkn_maker_uni, POST_STREAM))