https://github.com/JasonKessler/scattertext
Cite as: Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 2017.
Link to preprint: https://arxiv.org/abs/1703.00565
@inproceedings{kessler2017scattertext, author = {Kessler, Jason S.}, title = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ}, booktitle = {ACL System Demonstrations}, year = {2017}, }
Data is from http://followthehashtag.com/datasets/
%matplotlib inline
import imp
import io
import json
import os
import urllib.request
from zipfile import ZipFile

import nltk
import numpy as np
import pandas as pd

import agefromname
import scattertext as st
from scattertext import tweet_tokenzier_factory
from scattertext.termranking import OncePerDocFrequencyRanker

from IPython.core.display import display, HTML
from IPython.display import IFrame
# Widen the notebook's main container so wide scattertext plots fit on screen.
display(HTML("<style>.container { width:98% !important; }</style>"))
# Load the geolocated-tweet dataset: prefer the locally cached CSV; on first
# run, download the zipped Excel workbook from followthehashtag.com, derive
# name columns, and cache the result for later runs.
try:
    df = pd.read_csv('usa_tweets.csv.gz')
except FileNotFoundError:
    # No cache yet: fetch the zip archive and parse the workbook in memory.
    with ZipFile(io.BytesIO(urllib.request.urlopen(
        'http://followthehashtag.com/content/uploads/USA-Geolocated-tweets-free-dataset-Followthehashtag.zip'
    ).read())) as zf:
        df = pd.read_excel(zf.open('dashboard_x_usa_x_filter_nativeretweets.xlsx'))
    # Lower-cased first/last tokens of the free-text 'User Name' field.
    # Single-token names keep the raw (unsplit) value in 'last_name'.
    df['first_name'] = df['User Name'].apply(
        lambda x: x.split()[0].lower() if isinstance(x, str) and len(x.split()) > 0 else x)
    df['last_name'] = df['User Name'].apply(
        lambda x: x.split()[-1].lower() if isinstance(x, str) and len(x.split()) > 1 else x)
    # Cache so subsequent runs skip the download.
    df.to_csv('usa_tweets.csv.gz', index=False, compression='gzip')
df[['first_name', 'last_name', 'User Name', 'Nickname', 'Tweet content']].iloc[:3]
first_name | last_name | User Name | Nickname | Tweet content | |
---|---|---|---|---|---|
0 | bill | schulhoff | Bill Schulhoff | BillSchulhoff | Wind 3.2 mph NNE. Barometer 30.20 in, Rising s... |
1 | daniele | polis | Daniele Polis | danipolis | Pausa pro café antes de embarcar no próximo vô... |
2 | kasey | jacobs | Kasey Jacobs | KJacobs27 | Good. Morning. #morning #Saturday #diner #VT #... |
# P(male) for each U.S. first name, derived from SSA baby-name records by the
# agefromname package; the result is indexed by lower-cased first name.
male_prob = agefromname.AgeFromName().get_all_name_male_prob()
male_prob.iloc[:3]
hi | lo | prob | |
---|---|---|---|
first_name | |||
aaban | 1.000000 | 9.574095e-01 | 1.0 |
aabha | 0.121295 | -1.387779e-17 | 0.0 |
aabid | 1.000000 | 5.628005e-01 | 1.0 |
# Attach the name->P(male) table to each tweet via the author's first name,
# then keep only confidently gendered authors: P(male) > 0.9 -> 'm',
# P(male) < 0.1 -> 'f', anything in between -> '?' (dropped).
df_aug = pd.merge(df, male_prob, left_on='first_name', right_index=True)
df_aug['gender'] = df_aug['prob'].apply(
    lambda p: 'm' if p > 0.9 else ('f' if p < 0.1 else '?'))
df_mf = df_aug[df_aug['gender'].isin(['m', 'f'])]
# Round-trip through CSV so the in-memory frame matches the cached file
# exactly (dtype normalization, dropped index).
df_mf.to_csv('emoji_data.csv', index=False)
df_mf = pd.read_csv('emoji_data.csv')
df_mf[['gender', 'first_name', 'User Name', 'Nickname', 'Tweet content']].iloc[:6]
gender | first_name | User Name | Nickname | Tweet content | |
---|---|---|---|---|---|
0 | m | bill | Bill Schulhoff | BillSchulhoff | Wind 3.2 mph NNE. Barometer 30.20 in, Rising s... |
1 | m | bill | Bill S Kenney | BillSKenney | Planning the new focuslabllc website with the ... |
2 | m | bill | Bill Pendley | BILLPENDLEY | #bibleverseoftheday @ Bill The Mortgage Guy ... |
3 | m | bill | Bill Culver | rilla6969 | Start Wars Dark Side Challenge race number one... |
4 | m | bill | Bill Esparza | streetgourmetla | Spinach fusilli by @chef_timothy. A pre #mexic... |
5 | m | bill | Bill Meadows | BillMeadows305 | https://t.co/N8E5aTvIIN |
# Per-author gender tally: dedupe on Nickname so prolific tweeters don't skew
# the counts. The original recomputed the same value_counts() three times;
# compute it once and reuse it for both the raw counts and the proportions.
author_gender_counts = df_mf[['Nickname', 'gender']].drop_duplicates()['gender'].value_counts()
print(author_gender_counts)
print(author_gender_counts / author_gender_counts.sum())
m 28159 f 21844 Name: gender, dtype: int64 m 0.563146 f 0.436854 Name: gender, dtype: float64
# Peek at the raw tweet text that gets tokenized below.
df_mf['Tweet content'].iloc[:5]
0 Wind 3.2 mph NNE. Barometer 30.20 in, Rising s... 1 Planning the new focuslabllc website with the ... 2 #bibleverseoftheday @ Bill The Mortgage Guy ... 3 Start Wars Dark Side Challenge race number one... 4 Spinach fusilli by @chef_timothy. A pre #mexic... Name: Tweet content, dtype: object
# Wrap NLTK's TweetTokenizer so it presents a spaCy-like interface, tokenize
# every tweet, and build a gender-categorized corpus of emoji features only.
nlp = st.tweet_tokenzier_factory(nltk.tokenize.TweetTokenizer())
df_mf['parse'] = df_mf['Tweet content'].apply(nlp)
corpus_factory = st.CorpusFromParsedDocuments(
    df_mf,
    parsed_col='parse',
    category_col='gender',
    feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
)
corpus = corpus_factory.build()
# Per-document label shown in the explorer: "User Name (@handle) date".
handle_part = ' (@' + df_mf['Nickname'] + ') '
metadata = df_mf['User Name'] + handle_part + df_mf['Date'].astype(str)
# Render the interactive emoji scatterplot (female vs. male authors) and save
# it to disk for embedding in an IFrame below.
html = st.produce_scattertext_explorer(
    corpus,
    category='f',
    category_name='Female',
    not_category_name='Male',
    use_full_doc=True,
    term_ranker=OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=metadata,
    width_in_pixels=1000
)
file_name = 'output/emoji_gender_scattertext.html'
# Create the output directory if needed; use a context manager so the file
# handle is closed (and flushed) deterministically instead of leaking.
os.makedirs('output', exist_ok=True)
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
# Frequency-vs-log-odds ("fightin' words") view of the same emoji corpus.
html = st.produce_fightin_words_explorer(corpus,
                                         category='f',
                                         category_name='Female',
                                         not_category_name='Male',
                                         term_ranker=OncePerDocFrequencyRanker,
                                         metadata=metadata)
file_name = 'output/emoji_gender_lorp.html'
# Same fix as above: ensure the directory exists and close the file promptly.
os.makedirs('output', exist_ok=True)
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
From https://www.census.gov/data/developers/data-sets/surnames.2010.html
from urllib.request import urlopen

# 2010 U.S. Census surname statistics: per-surname bearer counts plus the
# percentage of bearers in each race/ethnicity bucket (the PCT* columns).
url = 'https://api.census.gov/data/2010/surname?get=NAME,COUNT,CUM_PROP100K,PCT2PRACE,PCTAIAN,PCTAPI,PCTBLACK,PCTHISPANIC,PCTWHITE,PROP100K&RANK=1:200000'
with urlopen(url) as f:
    raw = f.read().decode('utf-8')
rows = json.loads(raw)
# '(S)' marks suppressed cells in the census data; coerce them to 0 so the
# whole frame can be cast to float.
name_df = pd.DataFrame(rows[1:], columns=rows[0]).set_index('NAME').replace('(S)', 0).astype(float).reset_index()
name_df['NAME'] = name_df['NAME'].apply(str.lower)
name_df = name_df.set_index('NAME')


def _dominant_pct_suffix(row):
    """Return the PCT* column suffix (e.g. 'WHITE') holding the largest share
    for this surname, or NaN when no bucket exceeds 85% of bearers.

    The original inline lambda computed the row max twice; do it once here."""
    share, column = max((v, k) for k, v in row.items() if k[:3] == 'PCT')
    return column[3:] if share > 85 else np.nan


name_df['heritage'] = name_df.apply(_dominant_pct_suffix, axis=1)
# Map raw column suffixes to display labels; '2PRACE' falls through to NaN.
name_df['heritage'] = name_df['heritage'].apply(lambda x:
    {'AIAN': 'Native American',
     'API': 'Asian',
     'BLACK': 'African American',
     'HISPANIC': 'Hispanic',
     'WHITE': 'White'}.get(x, np.nan))
name_df[name_df['PCTHISPANIC'] > 85].sort_values(by='COUNT', ascending=False).iloc[:5]
COUNT | CUM_PROP100K | PCT2PRACE | PCTAIAN | PCTAPI | PCTBLACK | PCTHISPANIC | PCTWHITE | PROP100K | RANK | heritage | |
---|---|---|---|---|---|---|---|---|---|---|---|
NAME | |||||||||||
garcia | 1166120.0 | 3400.12 | 0.26 | 0.47 | 1.41 | 0.45 | 92.03 | 5.38 | 395.32 | 6.0 | Hispanic |
rodriguez | 1094924.0 | 4543.50 | 0.18 | 0.18 | 0.57 | 0.54 | 93.77 | 4.75 | 371.19 | 9.0 | Hispanic |
martinez | 1060159.0 | 4902.90 | 0.22 | 0.51 | 0.60 | 0.49 | 92.91 | 5.28 | 359.40 | 10.0 | Hispanic |
hernandez | 1043281.0 | 5256.58 | 0.16 | 0.19 | 0.60 | 0.36 | 94.89 | 3.79 | 353.68 | 11.0 | Hispanic |
lopez | 874523.0 | 5553.05 | 0.25 | 0.38 | 1.02 | 0.57 | 92.92 | 4.86 | 296.47 | 12.0 | Hispanic |
# Most common surnames whose bearers are >85% African American.
name_df[name_df['PCTBLACK'] > 85].sort_values(by='COUNT', ascending=False).iloc[:5]
COUNT | CUM_PROP100K | PCT2PRACE | PCTAIAN | PCTAPI | PCTBLACK | PCTHISPANIC | PCTWHITE | PROP100K | RANK | heritage | |
---|---|---|---|---|---|---|---|---|---|---|---|
NAME | |||||||||||
washington | 177386.0 | 20370.63 | 3.78 | 0.68 | 0.30 | 87.53 | 2.54 | 5.17 | 60.14 | 145.0 | African American |
pierre | 33913.0 | 41272.38 | 2.23 | 0.92 | 0.31 | 86.74 | 2.75 | 7.05 | 11.50 | 1026.0 | African American |
smalls | 12435.0 | 53820.35 | 2.76 | 0.28 | 0.23 | 90.49 | 2.46 | 3.78 | 4.22 | 2888.0 | African American |
jeanbaptiste | 7915.0 | 59139.71 | 2.50 | 0.13 | 0.21 | 94.04 | 2.15 | 0.97 | 2.68 | 4483.0 | African American |
diallo | 7502.0 | 59784.56 | 1.55 | 0.09 | 0.12 | 95.64 | 0.76 | 1.84 | 2.54 | 4730.0 | African American |
# Total name-bearers per heritage bucket, including surnames with no dominant bucket ('N/A').
name_df.fillna('N/A').groupby('heritage').sum()['COUNT'].sort_values()
heritage Native American 98523.0 African American 1343997.0 Asian 7336493.0 Hispanic 36248570.0 White 92111160.0 N/A 128528485.0 Name: COUNT, dtype: float64
# Share of name-bearers per heritage bucket, among labeled surnames only.
# The original evaluated the identical dropna().groupby(...).sum()['COUNT']
# pipeline twice; compute it once and reuse it.
heritage_totals = name_df.dropna().groupby('heritage').sum()['COUNT']
heritage_totals.sort_values() / heritage_totals.sum()
heritage Native American 0.000718 African American 0.009800 Asian 0.053497 Hispanic 0.264320 White 0.671664 Name: COUNT, dtype: float64
# Join tweets to census heritage labels via the author's last name; the inner
# join drops authors whose surname has no confident heritage assignment.
labeled_surnames = name_df[['heritage']].dropna()
df_mf_heritage = df_mf.merge(labeled_surnames, left_on='last_name', right_index=True, how='inner')
df_mf_heritage['heritage'].value_counts()
White 22176 Hispanic 9188 Asian 1249 African American 166 Native American 8 Name: heritage, dtype: int64
# One binary one-vs-rest column per heritage of interest: the value itself
# when it matches the target (e.g. 'White'), otherwise 'Not <target>'.
for target in ('White', 'Hispanic', 'Asian'):
    df_mf_heritage['Is-' + target] = df_mf_heritage['heritage'].apply(
        lambda value, t=target: value if value == t else 'Not ' + t)
# Emoji usage: authors with predominantly-White surnames vs. everyone else.
metadata = (df_mf_heritage['User Name']
            + ' (@' + df_mf_heritage['Nickname'] + ') '
            + df_mf_heritage['Date'].astype(str))
html = st.produce_scattertext_explorer(
    corpus=st.CorpusFromParsedDocuments(
        df_mf_heritage,
        parsed_col='parse',
        category_col='Is-White',
        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
    ).build(),
    category='White',
    category_name='White',
    not_category_name='Not-White',
    use_full_doc=True,
    term_ranker=OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=metadata,
    width_in_pixels=1000,
    max_docs_per_category=1000
)
file_name = 'output/emoji_white_v_all.html'
# Ensure the directory exists; close the file deterministically (no leak).
os.makedirs('output', exist_ok=True)
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
# Emoji usage: authors with predominantly-Hispanic surnames vs. everyone else.
metadata = (df_mf_heritage['User Name']
            + ' (@' + df_mf_heritage['Nickname'] + ') '
            + df_mf_heritage['Date'].astype(str))
html = st.produce_scattertext_explorer(
    corpus=st.CorpusFromParsedDocuments(
        df_mf_heritage,
        parsed_col='parse',
        category_col='Is-Hispanic',
        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
    ).build(),
    category='Hispanic',
    category_name='Hispanic',
    not_category_name='Not-Hispanic',
    use_full_doc=True,
    term_ranker=OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=metadata,
    width_in_pixels=1000,
    max_docs_per_category=1000
)
file_name = 'output/emoji_hispanic_v_all.html'
# Ensure the directory exists; close the file deterministically (no leak).
os.makedirs('output', exist_ok=True)
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)