import pandas as pd
import numpy as np
import wikipedia
import networkx as nx
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
from collections import Counter
import wordcloud
warnings.filterwarnings("ignore")
%matplotlib inline
def get_top(data, sort_by, gender="all", top_x=10):
    """Return the top_x rows of `data` ranked by column `sort_by`, highest first.

    When `gender` is "female" or "male", only rows whose "gender" column
    equals that value are ranked; any other value (such as the default
    "all") leaves the rows unfiltered. The result keeps only the "WikiURL"
    and `sort_by` columns.
    """
    if gender in ("female", "male"):
        data = data[data["gender"] == gender]
    ranked = data.sort_values(sort_by, ascending=False)
    return ranked[["WikiURL", sort_by]].head(top_x)
def distribution_plot(data, column, gender="all", x_scale="linear", y_scale="linear"):
    """Plot the distribution of `column`, optionally restricted to one gender.

    Parameters:
    1. data - dataframe holding `column` and a "gender" column
    2. column - name of the column whose values are plotted
    3. gender - "all" (default) plots every row; any other value keeps only
       the rows whose "gender" column equals it
    4. x_scale, y_scale - axis scale names handed to matplotlib
       (for example "linear" or "log")
    """
    if gender != "all":
        data = data[data["gender"] == gender]
    axes = sns.distplot(data[column])
    # Apply the requested scales; with the "linear" defaults the plot looks
    # exactly as it does without this call.
    axes.set(xscale=x_scale, yscale=y_scale)
def flatten(data, col):
    """Concatenate the per-row sequences stored in data[col] into one flat list."""
    flat = []
    for cell in data[col].values:
        flat.extend(cell)
    return flat
def wordcloud_col(data, gender="female", col="occupation"):
    """Draw a word cloud sized by how often each entry of `col` occurs.

    `gender` values "female" and "male" restrict the rows to that gender;
    any other value uses all rows. For the "occupation" column the entry
    "politician" is left out of the cloud.
    """
    if gender in ("female", "male"):
        data = data[data["gender"] == gender]

    entries = flatten(data, col)
    if col == "occupation":
        entries = [entry for entry in entries if entry != "politician"]

    cloud = wordcloud.WordCloud(
        scale=1, width=800, height=400, background_color="white"
    )
    image = cloud.generate_from_frequencies(Counter(entries))

    plt.figure(figsize=(16, 8))
    plt.imshow(image)
    plt.axis("off")
    plt.show()
def bar_col(data, gender="female", col="party"):
    """Plot a horizontal bar chart of the ten most frequent entries in `col`.

    Parameters:
    1. data - dataframe whose `col` cells each hold a sequence of labels
    2. gender - "female" or "male" restricts the rows to that gender; any
       other value (for example "all") uses every row
    3. col - column to count; two columns get special treatment:
       "occupation" ignores the entry "politician", and "nationality"
       drops the single most frequent entry before plotting
    """
    if gender in ("female", "male"):
        data = data[data["gender"] == gender]

    labels = flatten(data, col)
    if col == "occupation":
        labels = [label for label in labels if label != "politician"]

    if col == "nationality":
        # Take one extra entry and discard the leader so ten bars remain.
        # NOTE(review): this is positional - it assumes the dominant
        # nationality always ranks first; confirm for filtered subsets.
        top = Counter(labels).most_common(11)[1:]
    else:
        top = Counter(labels).most_common(10)

    df = pd.DataFrame(top, columns=["label", "value"])
    # x/y are passed by keyword: newer seaborn releases no longer accept
    # them positionally, while the keyword form works on old and new versions.
    sns.barplot(x=df["value"], y=df["label"])
def word_cloud_txt(df):
    """Draw a word cloud from all texts in df["page_content"], joined by spaces."""
    corpus = " ".join(df["page_content"].values)
    cloud = wordcloud.WordCloud(width=800, height=400).generate(corpus)
    plt.figure(figsize=(16, 8))
    plt.imshow(cloud)
    plt.axis("off")
    plt.show()
def plot_gender_over_time(data):
    """Plot, per value of the "entered" column, how many rows are male and female.

    One point is drawn per year and gender, with the two genders in
    different colours. A year in which a gender does not occur is plotted
    as 0 for that gender.
    """
    rows = []
    for year in sorted(data["entered"].unique()):
        counts = data.loc[data["entered"] == year, "gender"].value_counts().to_dict()
        # .get() covers years in which either gender is absent; indexing
        # counts["male"] directly raised KeyError for a year without males.
        rows.append([year, counts.get("male", 0), counts.get("female", 0)])

    wide = pd.DataFrame(rows, columns=["year", "males", "females"])
    long_form = pd.melt(wide, id_vars=["year"], value_vars=["males", "females"])

    # seaborn renamed factorplot to catplot; factorplot drew point plots by
    # default, so kind="point" keeps the same chart. Fall back to the old
    # name on seaborn versions that predate catplot.
    if hasattr(sns, "catplot"):
        grid = sns.catplot(
            x="year", y="value", hue="variable", data=long_form, kind="point"
        )
    else:
        grid = sns.factorplot(x="year", y="value", hue="variable", data=long_form)
    grid.set_xticklabels(rotation=90)
def add_page_content(df, gender):
    """Return the rows of `df` for one gender with a "page_content" column added.

    The new column holds the result of page_content() for each row's
    "WikiURL". The input dataframe is left unmodified.
    """
    # Copy the filtered rows so the new column is written to an independent
    # frame rather than to a slice of the caller's dataframe.
    subset = df[df["gender"] == gender].copy()
    subset["page_content"] = subset["WikiURL"].apply(page_content)
    return subset
def page_content(url):
    """Return the text of the Wikipedia page behind `url`, or "" on failure.

    The page title is taken as the part of `url` after the last "/".
    Newlines and "==" heading markers are stripped from the returned text.
    """
    try:
        title = url.split("/")[-1]
        return wikipedia.page(title).content.replace("\n", "").replace("==", "")
    except Exception:
        # Best effort: any lookup or parsing failure yields an empty text.
        # Catching Exception rather than using a bare except lets Ctrl-C
        # (KeyboardInterrupt) still abort a long fetch run.
        return ""
def add_doc_length(df):
    """Add an "n_terms" column holding len() of each "page_content" value.

    For string contents this is the number of characters. The column is
    written onto `df` itself, which is also returned.
    """
    lengths = df["page_content"].apply(len)
    df["n_terms"] = lengths
    return df
def print_wikipage_gist(page_name):
    """Fetch a Wikipedia page and print its title, URL, summary and first 10 image links."""
    page = wikipedia.page(page_name)

    title_line = "Title: {} \n".format(page.title)
    print(title_line)

    url_line = "URL: {} \n".format(page.url)
    print(url_line)

    summary_block = "Page Summary: \n {} \n".format(page.summary)
    print(summary_block)

    first_images = page.images[:10]
    print("Links to first 10 images: {} \n".format(first_images))
def filter_graph(G, attribute, value):
    """Return the subgraph of G whose nodes match `value` on `attribute`.

    Input parameters:
    1. G - graph
    2. attribute - filter by this attribute: {gender, occupation, party, nationality}
    3. value - filter by this value, for example 'female' in the case of gender

    For "gender" the node's attribute must equal `value`; for every other
    attribute `value` must be contained in the node's attribute. Nodes
    that do not carry `attribute` at all are left out.
    """
    exact_match = attribute == "gender"
    keep = []
    # G.nodes(data=True) yields (node, attribute-dict) pairs on both old and
    # new networkx releases; the G.node mapping is missing from newer ones.
    for node, attrs in G.nodes(data=True):
        if attribute not in attrs:
            # Skip nodes without the attribute instead of raising KeyError.
            continue
        stored = attrs[attribute]
        matched = (stored == value) if exact_match else (value in stored)
        if matched:
            keep.append(node)
    return G.subgraph(keep)
def plot_graph(data_source="data/2016_12_dir_dir", gender="all"):
    """Load a pickled graph from `data_source` and draw it with blue edges.

    With gender "all" the whole graph is drawn; any other value draws only
    the subgraph that filter_graph() returns for that gender.
    """
    # NOTE(review): read_gpickle unpickles the file - only use trusted data files.
    to_draw = nx.read_gpickle(data_source)
    if gender != "all":
        to_draw = filter_graph(to_draw, "gender", gender)
    plt.figure(figsize=(16, 12))
    nx.draw(to_draw, edge_color='b')
First of all, let's load the data about German politicians on Wikipedia
data = pd.read_pickle("data/german_politicians")
data.head()
WikiURL | ID | gender | name | party | nationality | entered | occupation | out_degree | in_degree | eig_central | views | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
7 | http://en.wikipedia.org/wiki/Otto_Hoetzsch | 427 | male | [ hoetzsch otto ] | [] | [german] | 2011 | [politician] | 1 | 0 | 2.021104e-19 | 1652.0 |
113 | http://en.wikipedia.org/wiki/Adolf_Grimme | 4123 | male | [ adolf grimme , grimme adolf ] | [] | [german] | 2012 | [politician] | 3 | 2 | 2.530002e-05 | 2376.0 |
127 | http://en.wikipedia.org/wiki/Heinz_Jost | 4787 | male | [ jost heinz ] | [] | [german] | 2009 | [nazism, politician] | 4 | 0 | -1.456437e-19 | 32143.0 |
132 | http://en.wikipedia.org/wiki/Heinz_Neukirchen | 4960 | male | [ heinz neukirchen , neukirchen heinz ] | [] | [german] | 2009 | [politician] | 1 | 1 | 9.096369e-08 | 2268.0 |
158 | http://en.wikipedia.org/wiki/Wilhelm_von_Bismarck | 5619 | male | [ wilhelm , wilhelm otto albrecht von bismarc... | [] | [german] | 2011 | [politician] | 2 | 2 | 7.461692e-04 | 19536.0 |
We have the following information:
data.gender.value_counts()
male 1136 female 171 Name: gender, dtype: int64
sns.countplot(x="gender", data=data)
<matplotlib.axes._subplots.AxesSubplot at 0xb5bcef0>
This plot shows how many new politicians (male or female) joined Wikipedia each year from 2001 to 2016.
plot_gender_over_time(data)
Politicians also have professional occupations other than politics; here is a word cloud of them.
wordcloud_col(data, gender="female")
wordcloud_col(data, gender="male")
Let's see what professions are most popular among female politicians
bar_col(data, gender="female", col="occupation")
Let's see what professions are most popular among male politicians
bar_col(data, gender="male", col="occupation")
Let's see what political parties are most popular among male politicians
bar_col(data, gender="male", col="party")
Let's see what political parties are most popular among female politicians
bar_col(data, gender="female", col="party")
Let's see what political parties are most popular among all (male and female) politicians
bar_col(data, gender="all", col="party")
Let's see the most common second nationalities of politicians
bar_col(data, gender="all", col="nationality")
On Wikipedia, articles can link to other articles. We can depict this as a network where each node refers to an article about a politician. If the article about politician A links to the article about politician B, we draw an edge between A and B. The blue edge with a thick end shows the direction of this relationship.
plot_graph(gender="female")
The measure that shows us who has the most links pointed in their direction is called in-degree
get_top(data, "in_degree", gender="female", top_x=5)
WikiURL | in_degree | |
---|---|---|
262 | http://en.wikipedia.org/wiki/Angela_Merkel | 205 |
5490 | http://en.wikipedia.org/wiki/Rosa_Luxemburg | 53 |
16937 | http://en.wikipedia.org/wiki/Hannelore_Kraft | 20 |
19808 | http://en.wikipedia.org/wiki/Ursula_von_der_Leyen | 19 |
25919 | http://en.wikipedia.org/wiki/Clara_Zetkin | 16 |
# sns.countplot(x="gender", data=data)
distribution_plot(data, gender="female",column="in_degree")
The measure that shows us who has the most links pointing to other politicians is called out-degree
get_top(data, "out_degree", gender="female", top_x=5)
WikiURL | out_degree | |
---|---|---|
262 | http://en.wikipedia.org/wiki/Angela_Merkel | 51 |
25953 | http://en.wikipedia.org/wiki/Sabine_Leutheusse... | 24 |
13584 | http://en.wikipedia.org/wiki/Heidemarie_Wieczo... | 24 |
16937 | http://en.wikipedia.org/wiki/Hannelore_Kraft | 20 |
5490 | http://en.wikipedia.org/wiki/Rosa_Luxemburg | 19 |
distribution_plot(data, gender="female",column="out_degree")
Who are the most popular German female politicians on Wikipedia?
get_top(data, "views", gender="female",top_x=5)
WikiURL | views | |
---|---|---|
262 | http://en.wikipedia.org/wiki/Angela_Merkel | 2404469.0 |
5490 | http://en.wikipedia.org/wiki/Rosa_Luxemburg | 396307.0 |
27562 | http://en.wikipedia.org/wiki/Frauke_Petry | 307116.0 |
19808 | http://en.wikipedia.org/wiki/Ursula_von_der_Leyen | 119953.0 |
14177 | http://en.wikipedia.org/wiki/Ulrike_Meinhof | 113341.0 |
sns.boxplot(data["views"])
<matplotlib.axes._subplots.AxesSubplot at 0xb7baf28>
This plot can show us how one variable influences the other variables.
data2 = data.drop(4526) # removing the outlier
data2
sns.pairplot(data2, kind='reg')
<seaborn.axisgrid.PairGrid at 0xb7ec0f0>
# x/y passed by keyword: newer seaborn releases reject positional x/y.
sns.lmplot(x="views", y="entered", data=data2)
<seaborn.axisgrid.FacetGrid at 0x11d94198>
# x/y passed by keyword: newer seaborn releases reject positional x/y.
sns.lmplot(x="views", y="in_degree", hue="gender", data=data2)
<seaborn.axisgrid.FacetGrid at 0xf0264e0>
We can fetch data about politicians on Wikipedia relatively easily. We just need to know the name of their page, which is the text after the last '/' in the URL. So, for 'https://en.wikipedia.org/wiki/Barack_Obama' it is "Barack_Obama".
wiki_page = wikipedia.page("Barack_Obama")
wiki_page.title
'Barack Obama'
wiki_page.url
'https://en.wikipedia.org/wiki/Barack_Obama'
wiki_page.images[:10]
['https://upload.wikimedia.org/wikipedia/commons/4/4c/Wikisource-logo.svg', 'https://upload.wikimedia.org/wikipedia/commons/9/9f/Job_Growth_by_U.S._President_-_v1.png', 'https://upload.wikimedia.org/wikipedia/commons/f/f5/Obama_signs_health_care-20100323.jpg', 'https://upload.wikimedia.org/wikipedia/commons/8/84/Percentage_of_Individuals_in_the_United_States_Without_Health_Insurance%2C_1963-2015.png', 'https://upload.wikimedia.org/wikipedia/commons/7/73/Blue_pencil.svg', 'https://upload.wikimedia.org/wikipedia/commons/b/b8/Barack_Obama_and_Bill_Clinton.jpg', 'https://upload.wikimedia.org/wikipedia/commons/4/4b/Barack_Obama_visiting_victims_of_2012_Aurora_shooting.jpg', 'https://upload.wikimedia.org/wikipedia/commons/f/f0/Seal_of_the_United_States_Senate.svg', 'https://upload.wikimedia.org/wikipedia/commons/a/a2/President_George_W._Bush_and_Barack_Obama_meet_in_Oval_Office.jpg', 'https://upload.wikimedia.org/wikipedia/commons/c/ca/Barack_Obama_playing_basketball_with_members_of_Congress_and_Cabinet_secretaries_2.jpg']
wiki_page.summary
'Barack Hussein Obama II ( ( listen); born August 4, 1961) is an American politician who served as the 44th President of the United States from January 20, 2009 to January 20, 2017. The first African American to assume the presidency, he was previously the junior United States Senator from Illinois from 2005 to 2008. Before that, he served in the Illinois State Senate from 1997 until 2004.\nObama was born in 1961 in Honolulu, Hawaii, two years after the territory was admitted to the Union as the 50th state. Raised largely in Hawaii, Obama also spent one year of his childhood in Washington State and four years in Indonesia. After graduating from Columbia University in New York City in 1983, he worked as a community organizer in Chicago. In 1988 Obama enrolled in Harvard Law School, where he was the first black president of the Harvard Law Review. After graduation, he became a civil rights attorney and professor and taught constitutional law at the University of Chicago Law School from 1992 to 2004. Obama represented the 13th District for three terms in the Illinois Senate from 1997 to 2004, when he ran for the U.S. Senate. Obama received national attention in 2004 with his unexpected March primary win, his well-received July Democratic National Convention keynote address, and his landslide November election to the Senate. In 2008, Obama was nominated for president a year after his campaign began and after a close primary campaign against Hillary Clinton. He was elected over Republican John McCain and was inaugurated on January 20, 2009. Nine months later, Obama was named the 2009 Nobel Peace Prize laureate, accepting the award with the caveat that he felt there were others "far more deserving of this honor than I."\nDuring his first two years in office, Obama signed many landmark bills into law. 
The main reforms were the Patient Protection and Affordable Care Act (often referred to as "Obamacare", shortened as the "Affordable Care Act"), the Dodd–Frank Wall Street Reform and Consumer Protection Act, and the Don\'t Ask, Don\'t Tell Repeal Act of 2010. The American Recovery and Reinvestment Act of 2009 and Tax Relief, Unemployment Insurance Reauthorization, and Job Creation Act of 2010 served as economic stimulus amidst the Great Recession. After a lengthy debate over the national debt limit, Obama signed the Budget Control and the American Taxpayer Relief Acts. In foreign policy, Obama increased U.S. troop levels in Afghanistan, reduced nuclear weapons with the United States–Russia New START treaty, and ended military involvement in the Iraq War. He ordered military involvement in Libya in opposition to Muammar Gaddafi; Gaddafi was killed by NATO-assisted forces, and he also ordered the military operation that resulted in the death of Osama bin Laden.\nAfter winning re-election by defeating Republican opponent Mitt Romney, Obama was sworn in for a second term in 2013. During his second term, Obama promoted inclusiveness for LGBT Americans. His administration filed briefs that urged the Supreme Court to strike down same-sex marriage bans as unconstitutional (United States v. Windsor and Obergefell v. Hodges). Obama advocated for gun control in response to the Sandy Hook Elementary School shooting, and issued wide-ranging executive actions concerning climate change and immigration. In foreign policy, Obama ordered military intervention in Iraq in response to gains made by ISIL after the 2011 withdrawal from Iraq, continued the process of ending U.S. combat operations in Afghanistan, promoted discussions that led to the 2015 Paris Agreement on global climate change, initiated sanctions against Russia following the invasion in Ukraine and again after Russian interference in the 2016 United States elections, brokered a nuclear deal with Iran, and normalized U.S. 
relations with Cuba. Obama left office in January 2017 with a 60% approval rating and currently resides in Washington, D.C. Since leaving office, his presidency has been favorably ranked by historians and the general public.'
number_char = 200
wiki_page.content[:number_char]  # this is just the first 200 characters
'Barack Hussein Obama II ( ( listen); born August 4, 1961) is an American politician who served as the 44th President of the United States from January 20, 2009 to January 20, 2017. The first African A'
This method will print out the Title, URL, Summary and the first 10 image URLs on the page
print_wikipage_gist("Star_Wars")
Title: Star Wars URL: https://en.wikipedia.org/wiki/Star_Wars Page Summary: Star Wars is an American epic space opera media franchise, centered on a film series created by George Lucas. It depicts the adventures of various characters "a long time ago in a galaxy far, far away". The franchise began in 1977 with the release of the film Star Wars (later subtitled Episode IV: A New Hope in 1981), which became a worldwide pop culture phenomenon. It was followed by the successful sequels The Empire Strikes Back (1980) and Return of the Jedi (1983); these three films constitute the original Star Wars trilogy. A prequel trilogy was released between 1999 and 2005, which received mixed reactions from both critics and fans. A sequel trilogy began in 2015 with the release of Star Wars: The Force Awakens and continued in 2017 with the release of Star Wars: The Last Jedi. The first eight films were nominated for Academy Awards (with wins going to the first two films released) and have been commercial successes, with a combined box office revenue of over US$8.5 billion, making Star Wars the second highest-grossing film series. Spin-off films include the animated Star Wars: The Clone Wars (2008) and Rogue One (2016), the latter of which is the first in a planned series of anthology films. The series has spawned an extensive media franchise including books, television series, computer and video games, theme park attractions and lands, and comic books, resulting in significant development of the series' fictional universe. Star Wars holds a Guinness World Records title for the "Most successful film merchandising franchise". In 2015, the total value of the Star Wars franchise was estimated at US$42 billion, making Star Wars the second-highest-grossing media franchise of all time. In 2012, The Walt Disney Company bought Lucasfilm for US$4.06 billion and earned the distribution rights to all subsequent Star Wars films, beginning with the release of The Force Awakens in 2015. 
The former distributor, 20th Century Fox, was to retain the physical distribution rights for the first two Star Wars trilogies, was to own permanent rights for the original 1977 film and was to continue to hold the rights for the prequel trilogy and the first two sequels to A New Hope until May 2020. Walt Disney Studios currently owns digital distribution rights to all the Star Wars films, excluding A New Hope. On December 14, 2017, the Walt Disney Company announced its pending acquisition of 21st Century Fox, including the film studio and all distribution rights to A New Hope. Links to first 10 images: ['https://upload.wikimedia.org/wikipedia/commons/f/fa/Wikiquote-logo.svg', 'https://upload.wikimedia.org/wikipedia/commons/9/94/SDCC_2015_-_Carrie_Fisher%2C_Mark_Hamill_%26_Harrison_Ford_%2819060574883%29.jpg', 'https://upload.wikimedia.org/wikipedia/en/4/48/Folder_Hexagonal_Icon.svg', 'https://upload.wikimedia.org/wikipedia/commons/2/23/The_Emperor_Has_No_Robes.jpg', 'https://upload.wikimedia.org/wikipedia/commons/3/36/Hasbro_4c_no_R.png', 'https://upload.wikimedia.org/wikipedia/commons/1/17/Star_Wars_Galaxy_Map_KOTOR_Quest.jpg', 'https://upload.wikimedia.org/wikipedia/commons/3/34/Frank_Oz_2012.jpg', 'https://upload.wikimedia.org/wikipedia/commons/e/ec/Boba_Fett.jpg', 'https://upload.wikimedia.org/wikipedia/en/4/4a/Commons-logo.svg', 'https://upload.wikimedia.org/wikipedia/commons/f/fc/Padlock-silver.svg']
We have already fetched the content of the politician pages to save time, as this can take a few minutes. Let's load this data:
f_content = pd.read_pickle("data/ctnt_f_politicians")
m_content = pd.read_pickle("data/ctnt_m_politicians")
f_content.head()
WikiURL | gender | name | party | nationality | entered | occupation | out_degree | in_degree | eig_central | views | page_content | n_terms | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
262 | http://en.wikipedia.org/wiki/Angela_Merkel | female | [ angela merkel ] | [ democratic awakening , christian democratic... | [german] | 2002 | [politician, chemist, scientist] | 51 | 205 | 5.054432e-02 | 2404469.0 | Angela Dorothea Merkel (; German: [aŋˈɡeːla ˈm... | 40200 |
575 | http://en.wikipedia.org/wiki/Vicky_Leandros | female | [ vicky leandros ] | [] | [german] | 2008 | [politician] | 0 | 1 | -2.036547e-19 | 59260.0 | Vassiliki Papathanasiou (bornGreek: Βασιλική Π... | 13222 |
926 | http://en.wikipedia.org/wiki/Marieluise_Beck | female | [ marieluise beck ] | [ the greens ] | [german] | 2014 | [politician] | 13 | 4 | 1.044176e-05 | 3098.0 | Marieluise Beck (born 25 June 1952 in Bramsche... | 6446 |
1138 | http://en.wikipedia.org/wiki/Annkathrin_Kammeyer | female | [ annkathrin kammeyer , kammeyer annkathrin ] | [ social democratic party of germany ] | [german] | 2012 | [politician] | 1 | 0 | -1.044400e-19 | 1340.0 | Annkathrin Kammeyer (born 12 January 1990) is ... | 1241 |
1408 | http://en.wikipedia.org/wiki/Britta_Böhler | female | [ britta böhler , bohler britta ] | [ groenlinks ] | [german] | 2005 | [politician] | 5 | 1 | 3.184073e-06 | 4320.0 | Britta Böhler (17 July 1960 in Freiburg im Bre... | 5007 |
word_cloud_txt(f_content)
word_cloud_txt(m_content)
m_content["n_terms"].describe()
count 1136.000000 mean 6356.247359 std 11212.024380 min 0.000000 25% 1674.500000 50% 3461.500000 75% 6655.250000 max 140284.000000 Name: n_terms, dtype: float64
sns.distplot(m_content["n_terms"])
<matplotlib.axes._subplots.AxesSubplot at 0xed2e518>
f_content["n_terms"].describe()
count 171.000000 mean 4784.608187 std 6029.411471 min 0.000000 25% 1481.500000 50% 3139.000000 75% 6124.500000 max 46102.000000 Name: n_terms, dtype: float64
sns.distplot(f_content["n_terms"])
<matplotlib.axes._subplots.AxesSubplot at 0xed10048>