# DLD conference Twitter-buzz analysis: compare #DLD13 vs #DLD14 tweet
# volume, Twitter clients, vocabulary, bigrams, and per-keyword buzz.
#
# Reconstructed from a whitespace-mangled notebook export: the original line
# breaks were collapsed to spaces and HTML entities (&lt; &gt; &quot; &nbsp;)
# were unescaped, turning several .replace() calls into no-ops.  Those spots
# are restored below and flagged with NOTE(review) where the intent had to
# be inferred.  Removed pandas APIs (.ix, .sort(columns=...),
# DataFrame.append, resample(how=...), tshift) are replaced with their
# modern equivalents, and Python 2 print statements with print() calls.

from collections import Counter
import re

import matplotlib.pyplot as plt
import nltk
from nltk.collocations import BigramCollocationFinder
import pandas as pd
from prettytable import PrettyTable

# %matplotlib inline  # notebook magic; kept as a comment outside IPython

# ---------------------------------------------------------------------------
# Load the 2014 archives (five overlapping TAGS exports) and the 2013 one.
# ---------------------------------------------------------------------------
frames = [pd.read_csv('TAGS - DLD14 - Archive.csv',
                      parse_dates={'Timestamp': ['created_at']})]
for fname in ('TAGS DLD14.2 - Archive.csv',
              'TAGS DLD14.3 - Archive.csv',
              'TAGS DLD14.4 - Archive.csv',
              'TAGS DLD14.5 - Archive.csv'):
    frames.append(pd.read_csv(fname, parse_dates={'Timestamp': ['created_at']}))
# DataFrame.append was removed in pandas 2.x; concat is the supported form.
data = pd.concat(frames)

data_old = pd.read_csv("dld13.csv", sep=",",
                       parse_dates={'Timestamp': ['created_at']})

# Sanity check: show the second-most frequent tweet id (duplicates are
# expected because the five TAGS archives overlap in time).
c = Counter(data.id_str)
print(data[data.id_str == c.most_common()[1][0]][:3])

# Restrict to the conference window.
data = data[data['Timestamp'] >= '2014-01-10']

tweets = {}
# Reduce each frame to the columns the analysis needs.
tweets['2014'] = data[['Timestamp', 'id_str', 'from_user', 'source', 'text']]
tweets['2013'] = data_old[['Timestamp', 'id_str', 'from_user', 'source', 'text']]

for year in ['2013', '2014']:
    # De-dup (overlapping archives) and index by time for resampling.
    tweets[year] = tweets[year].drop_duplicates()
    tweets[year] = tweets[year].set_index('Timestamp')

# NOTE(review): in the mangled source this line read
#     tweets['2014']['source'][... == 'web'] = 'web'
# which is a no-op; the right-hand side was presumably an HTML-wrapped
# value lost when entities were unescaped -- confirm against the raw
# notebook.  Rewritten with .loc to avoid chained assignment.
tweets['2014'].loc[tweets['2014']['source'] == 'web', 'source'] = 'web'

# The 2013 export stores 'source' as escaped HTML; unescape it.
# (The mangled source had self-replace no-ops here, e.g. replace('<','<').)
tweets['2013']['source'] = [
    x.replace('&lt;', '<').replace('&gt;', '>').replace('&quot;', '"')
    for x in tweets['2013']['source']
]
for year in ['2013', '2014']:
    # NOTE(review): originally replace(' ', ' '); assumed to strip &nbsp;.
    tweets[year]['source'] = [x.replace('&nbsp;', ' ')
                              for x in tweets[year]['source']]

# ---------------------------------------------------------------------------
# Tweets-per-15-minutes time series and the year-over-year buzz plot.
# ---------------------------------------------------------------------------
ticks = {}
for year in ['2013', '2014']:
    tweets[year]['Tweets'] = 1  # unit column so resample().count() counts rows
    ticks[year] = tweets[year].loc[:, ['Tweets']]
    ticks[year] = ticks[year].Tweets.resample('15min').count()

plt.title('DLD14 Conference Buzz')
plt.ylabel('Number of Tweets')
ticks["2014"].loc['2014-01-18':'2014-01-22'].plot()
plt.savefig('DLD14_Buzz.png')

print(max(ticks["2013"]))
print(max(ticks["2014"]))

# Overlay 2013 on the 2014 calendar by shifting it forward 364 days
# (52 weeks, so weekdays line up).  tshift() is removed in modern pandas.
ticks["2014o"] = ticks["2014"] * 1.0
ticks["2013o"] = ticks["2013"].shift(364, freq="d")

fig, ax = plt.subplots()
ticks["2013o"].loc['2014-01-18':'2014-01-23'].plot(color="red",
                                                   label="DLD 2013")
ticks["2014o"].loc['2014-01-18':'2014-01-23'].plot(label="DLD 2014")
legend = ax.legend(loc='upper left', shadow=True)
plt.xlabel('Date')
plt.title('#DLD14 Conference Buzz')
plt.ylabel('Number of Tweets')
plt.savefig('DLD14_Buzz_Comp_Comparison.png')

# ---------------------------------------------------------------------------
# Twitter-client ("source") comparison table.
# ---------------------------------------------------------------------------
devices = {}
for y in tweets:
    # Total tweet count per raw client string.
    devices[y] = tweets[y].groupby("source", as_index=False)[["Tweets"]].sum()


def _client_table(year, count_col):
    """Extract display names from '<a ...>Name</a>' source strings and
    return a DataFrame of per-client tweet counts for the given year."""
    names, counts = [], []
    for src in devices[year]["source"]:
        # NOTE(review): the mangled source used re.match("<.*>(.*)", x);
        # its greedy first group leaves group(1) empty for well-formed
        # '<a ...>Name</a>' values.  The non-greedy pattern below restores
        # the intended name extraction -- confirm against original output.
        m = re.match(r"<[^>]*>([^<]*)", src)
        if m:
            names.append(m.group(1))
            counts.extend(devices[year]["Tweets"][devices[year]["source"] == src])
    return pd.DataFrame({"Device": names, count_col: counts})


d = _client_table("2014", "Tweets_2014")
d["Rel_2014"] = 100.0 * d["Tweets_2014"] / sum(d["Tweets_2014"])
e = _client_table("2013", "Tweets_2013")
e["Rel_2013"] = 100.0 * e["Tweets_2013"] / sum(e["Tweets_2013"])

f = pd.merge(d, e, how="outer", on="Device")
# Growth index: 2014 share relative to 2013 share (100 = unchanged).
f["Growth"] = f.Rel_2014 / f.Rel_2013

tb = (f[(f['Rel_2014'] >= 0.05) & (f['Growth'] >= 0)]
      .sort_values(by="Growth", ascending=False)
      [["Device", "Rel_2013", "Rel_2014", "Growth"]])
tb.columns = ["Device", "% 2013", "% 2014", "Index"]
tb["% 2014"] = tb["% 2014"].round(2)
tb["% 2013"] = tb["% 2013"].round(2)
tb["Index"] = (100 * tb["Index"]).round(1)

pt = PrettyTable(field_names=["Twitter Clients #DLD14", "Percent 2013",
                              "Percent 2014", "Index"])
for _, row in tb.iterrows():
    pt.add_row(row)
# BUG FIX: the original indexed pt.align with a tuple of alignment lookups
# (pt.align[pt.align[...], ...] = ...); assign each column's alignment.
(pt.align['Twitter Clients #DLD14'], pt.align['Percent 2013'],
 pt.align['Percent 2014'], pt.align['Index']) = 'l', 'r', 'r', 'r'
print(pt)

# ---------------------------------------------------------------------------
# Text analysis: tokenize, normalize, drop stopwords and noise tokens.
# ---------------------------------------------------------------------------
bigram_measures = nltk.collocations.BigramAssocMeasures()
stop = nltk.corpus.stopwords.words('english')
stop = stop + nltk.corpus.stopwords.words('german')

text = {}
words = {}
for year in tweets:
    raw = " ".join(tweets[year]["text"])
    tokens = nltk.WordPunctTokenizer().tokenize(raw)
    text[year] = nltk.Text(tokens)
    words[year] = [w.lower() for w in text[year]]
    words[year] = [w for w in words[year] if len(w) > 2]
    words[year] = [w for w in words[year] if w not in stop]
    # Drop punctuation tokens (substring-of-string test, as in the
    # original filter(); Python 3 filter() would return an iterator).
    words[year] = [w for w in words[year]
                   if w not in '"\'!%,-:()$\/;?.’–“”#@&']
    words[year] = [w for w in words[year] if w not in [
        "://", "http", "co", "rt", "va", "l", "se", "...", ".\"", "amp",
        "us", "en", "el", "y", "de", "que", "via", "12", "000", "hoy",
        "por", "les", "per", "la", "los", "5", "1", ".@", "con"]]
    # Strip stray bytes left over from mis-decoded UTF-8.
    for junk in ("\xe2", "\xc3", "\xb3"):
        words[year] = [w.replace(junk, "") for w in words[year]]

# Corpus size, vocabulary size, and average occurrences per distinct token.
numwords = {}
uniwords = {}
lexi = {}
for year in text:
    numwords[year] = len(text[year])
    uniwords[year] = len(set(text[year]))
    lexi[year] = 1.0 * numwords[year] / uniwords[year]
print(numwords)
print(uniwords)
print(lexi)

# ---------------------------------------------------------------------------
# Word-frequency tables.
# ---------------------------------------------------------------------------
# NOTE(review): freq_table was referenced but never built in the mangled
# source (a lost notebook cell); reconstructed here as an outer merge of
# per-year token frequencies -- confirm against the original notebook.
freq = {year: nltk.FreqDist(words[year]) for year in words}
freq_table = pd.merge(
    pd.DataFrame(list(freq["2013"].items()), columns=["Word", "Freq_2013"]),
    pd.DataFrame(list(freq["2014"].items()), columns=["Word", "Freq_2014"]),
    how="outer", on="Word")

for year in numwords:
    freq_table["Perc_" + year] = (100.0 * freq_table["Freq_" + year]
                                  / numwords[year])

for year in ["2014"]:
    # Growth index of a word's share vs the previous year (100 = flat).
    freq_table["Growth_" + year] = (
        100.0 * freq_table["Perc_" + year]
        / freq_table["Perc_" + str(int(year) - 1)])
    tb = (freq_table[freq_table['Perc_' + str(year)] >= 0.09]
          .sort_values(by="Growth_" + str(year), ascending=False)
          [["Word", "Freq_" + str(year), "Perc_" + str(year),
            "Growth_" + str(year)]])
    tb.columns = ["Word", "Freq", "Percent", "Index"]
    # BUG FIX: tb.Index = ... set an instance attribute, not the column.
    tb["Index"] = tb['Index'].round(1)
    tb["Percent"] = tb['Percent'].round(4)
    pt = PrettyTable(field_names=[str(year), 'Frequency', 'Percent', "Index"])
    for _, row in tb[:25].iterrows():
        pt.add_row(row)
    (pt.align[str(year)], pt.align['Frequency'], pt.align['Percent'],
     pt.align['Index']) = 'l', 'r', 'r', 'r'
    print(pt)

# Top words 2014 ('year' is still "2014" after the loop above, as in the
# original).
tb = (freq_table.sort_values(by="Perc_2014", ascending=False)
      [["Word", "Freq_" + str(year), "Perc_" + str(year),
        "Growth_" + str(year)]])
tb.columns = ["Word", "Freq", "Percent", "Index"]
tb["Index"] = tb['Index'].round(1)
tb["Percent"] = tb['Percent'].round(4)
pt = PrettyTable(field_names=["Top 2014", 'Frequency', 'Percent', "Index"])
for _, row in tb[:25].iterrows():
    pt.add_row(row)
(pt.align["Top 2014"], pt.align['Frequency'], pt.align['Percent'],
 pt.align['Index']) = 'l', 'r', 'r', 'r'
print(pt)

# Top words 2013.
tb = (freq_table.sort_values(by="Perc_2013", ascending=False)
      [["Word", "Freq_2013", "Perc_2013"]])
tb.columns = ["Word", "Freq", "Percent"]
tb["Percent"] = tb['Percent'].round(4)
pt = PrettyTable(field_names=["Top 2013", 'Frequency', 'Percent'])
for _, row in tb[:25].iterrows():
    pt.add_row(row)
pt.align["Top 2013"], pt.align['Frequency'], pt.align['Percent'] = 'l', 'r', 'r'
print(pt)

# ---------------------------------------------------------------------------
# Top bigrams per year.
# ---------------------------------------------------------------------------
for year in ["2013", "2014"]:
    for junk in ("\x80", "\x99"):
        words[year] = [w.replace(junk, "") for w in words[year]]
    # Additional Spanish / noise tokens that survived the earlier filters.
    words[year] = [w for w in words[year] if w not in [
        "como", "oro", "las", "nadie", "cmo", "todos", "hablan", "una",
        "hacerlo", "sabe", ")", "todo", "decidir", "slo", "adida"]]
    print("Top Bigrams " + str(year))
    finder = BigramCollocationFinder.from_words(words[year])
    scored = finder.score_ngrams(bigram_measures.raw_freq)
    pt = PrettyTable(field_names=['Bigram', 'Frequency'])
    for pair, score in scored[:35]:
        pt.add_row([" ".join(pair), round(score, 4)])
    pt.align['Bigram'], pt.align['Frequency'] = 'l', 'r'
    print(pt)

text["2014"].concordance("xenon")

# ---------------------------------------------------------------------------
# Buzz timelines for selected topic keywords.
# ---------------------------------------------------------------------------
query = ["xenon", "wales", "data", "rovio"]
col = ["red", "blue", "green", "black"]
data = tweets["2014"]
data['text'] = data['text'].str.lower()
results = {}
fig, ax = plt.subplots()
for q in range(len(query)):
    results[q] = data.loc[data["text"].str.contains(query[q]), ['Tweets']]
    results[q] = results[q].Tweets.resample('30min').count()
    results[q].loc['2014-01-19':'2014-01-22'].plot(color=col[q],
                                                   label=query[q])
legend = ax.legend(loc='upper right', shadow=True)
plt.xlabel('Date')
plt.title('#DLD14 Conference Buzz for ' + ", ".join(query))
plt.ylabel('Number of Tweets')