import requests
from bs4 import BeautifulSoup
# -- Scrape the TEDx home page and print every "talks by language" link. --
TEDX_HOME_URL = "http://tedxtalks.ted.com"
LANG_URL = "/browse/talks-by-language/"

s = requests.get(TEDX_HOME_URL)
# Name the parser explicitly: a bare BeautifulSoup(content) picks whatever
# parser happens to be installed, which can change the parse tree (and
# emits a warning in modern bs4).
soup = BeautifulSoup(s.content, "html.parser")

total = 0
for link_tag in soup.find_all('a', href=True):
    link = link_tag['href']
    if link.startswith(LANG_URL):
        # The language label sits a few nodes after the <a> tag in the
        # page markup.  Only walk the next_element chain for actual
        # language links -- on unrelated anchors the chain may dead-end
        # and raise.
        lang = link_tag.next_element.next_element.next_element
        print("Language %s: %s" % (lang, link))
        total += 1
print("Total: %d" % (total))
import re
# -- Crawl the paginated listing for one language, printing video links --
# -- until the site reports an empty page.                              --
VIDEO_LINK_PREFIX = "mvp_grid_panel_img_"
MSG_CLASS = "mvp_padded_message"
EMPTY_PAGE_MSG = "This page is empty."

portal_url = "http://tedxtalks.ted.com/browse/talks-by-language/icelandic"
page = 1
while True:
    # EX: http://tedxtalks.ted.com/browse/talks-by-language/icelandic?page=1
    url = portal_url + "?page=" + str(page)
    print("Reading URL: " + url)
    s = requests.get(url)
    soup = BeautifulSoup(s.content, "html.parser")
    # Stop condition: past the last page the site renders a
    # <div class="mvp_padded_message"> whose text is "This page is empty."
    # (A stray copy of that message had leaked into this cell as a bare
    # statement, which was a SyntaxError -- removed.)
    msg_tag = soup.find('div', {'class': MSG_CLASS})
    if msg_tag and msg_tag.get_text() == EMPTY_PAGE_MSG:
        print("empty page.")
        break
    # Video links are anchors whose id starts with the grid-panel prefix.
    link_tags = soup.find_all('a', id=re.compile(VIDEO_LINK_PREFIX), href=True)
    for link_tag in link_tags:
        link = link_tag['href']
        # EX: /list/search%3Atag%3A%22chinese%22/video/The-tragedy-of-...
        #  -> /video/The-tragedy-of-...
        pos = link.find("/video")
        link = link[pos:]
        print("video link: %s (attr Language: Icelandic)" % (link))
    page += 1
# Regex intended to pull the embedded YouTube video id out of a talk page.
# NOTE(review): the pattern body looks stripped/truncated by the notebook
# export -- under re.VERBOSE all whitespace (including the newlines) is
# ignored, so this byte pattern reduces to just ".*", which matches nearly
# every line of the HTML instead of a video id.  Presumably the original
# pattern was anchored on the YouTube embed URL; TODO restore it before
# relying on this cell.
VIDEO_ID_RE = b"""
.*
"""
url = "http://tedxtalks.ted.com/video/TEDxReykjavik-Eythor-Edvardsson"
s = requests.get(url)
# Raw response body as bytes -- matched with a byte pattern below.
html = s.content
# IGNORECASE/VERBOSE reflect the (missing) pattern's intent; they do not
# change the behavior of ".*" itself.
video_ids = re.findall(VIDEO_ID_RE, html, re.IGNORECASE|re.VERBOSE)
for video_id in video_ids:
    # NOTE: byte pattern => matches are byte strings, so decode to print
    print("YouTube ID: %s (%s)" % (url, video_id.decode('utf-8')))
#https://developers.google.com/youtube/articles/view_youtube_jsonc_responses
#https://developers.google.com/youtube/2.0/developers_guide_jsonc
import requests
import json
import time
# -- Pull upload metadata for the "tedxtalks" YouTube channel via the   --
# -- (now retired) gdata v2 JSON-C API; dump one JSON object per video. --
user_id = "tedxtalks"
maxcount = 25   # API page size
count = 0       # videos processed so far

# NOTE: the gdata query parameter is "max-results" (plural); the earlier
# "max-result" spelling was silently ignored by the API.
API_URL_TMPL = ("https://gdata.youtube.com/feeds/api/users/%s/uploads"
                "?v=2&alt=jsonc&start-index=%d&max-results=%d")

# One throw-away request just to learn how many uploads exist in total.
# Use s.text (str), not s.content (bytes), so the split/loads work on
# Python 3 as well.
s = requests.get(API_URL_TMPL % (user_id, 1, 1))
data = [json.loads(row) for row in s.text.split("\n") if row]
totalcount = data[0]['data']['totalItems']
pagenumber = totalcount // maxcount + 1   # integer number of API pages

# Attributes we want to keep for every video.
key = ['id', 'uploaded', 'category', 'title', 'tags', 'thumbnail',
       'duration', 'likeCount', 'rating', 'ratingCount', 'viewCount',
       'favoriteCount', 'commentCount']
# Template record: 'id' at top level, everything else under 'data'.
tedx = {'id': '',
        'data': {'uploaded': '', 'title': '', 'tags': '', 'thumbnail': '',
                 'duration': '', 'likeCount': '', 'rating': '',
                 'ratingCount': '', 'viewCount': '', 'favoriteCount': '',
                 'commentCount': ''}
        }

# Obtaining data from each page (sample: first page only).
for index in range(1, 2):  # use range(1, pagenumber + 1) for a full crawl
    # start-index is 1-based: pages begin at 1, 26, 51, ...
    # (the previous "index * maxcount" skipped items 26..49 on page 2,
    # 51..74 on page 3, and so on)
    start_index = (index - 1) * maxcount + 1
    s = requests.get(API_URL_TMPL % (user_id, start_index, maxcount))
    data = [json.loads(row) for row in s.text.split("\n") if row]
    metadata = data[0]['data']['items']
    # obtaining each item in the page (sample: first 5 of up to 25)
    for i in range(5):  # use range(len(metadata)) for the full page
        count += 1
        u = metadata[i]
        for j in key:
            if j == 'id':
                tedx['id'] = u['id']
            elif j == 'thumbnail':
                tedx['data'][j] = u[j]['hqDefault']
            else:
                # '-' marks attributes this particular video is missing.
                # (Title is kept as str: encoding it to bytes would make
                # json.dumps fail on Python 3.)
                tedx['data'][j] = u.get(j, '-')
        print(json.dumps(tedx))
        # be polite to the API between items
        time.sleep(1)
# https://developers.google.com/youtube/2.0/developers_guide_jsonc
import json
# -- Merge the per-video attributes scraped from the TEDx site into the --
# -- YouTube metadata records; print fully-merged records as JSON.      --
SITE_JSON = "tedx_video.json"    # single JSON document from the TEDx site
YOUTUBE_JSON = "tedx_v7.txt"     # one JSON object per line (from the crawl)
SITE_ATTR_LIST = ['lang', 'event', 'country', 'topic']

# Build video_dict: YouTube id -> {site attributes present for that video}.
# "with" guarantees the file handle is closed even if json.load raises.
video_dict = {}
with open(SITE_JSON) as site_json_file:
    site_json = json.load(site_json_file)
for video in site_json:
    vid = site_json[video]['id']
    video_dict[vid] = {}
    for attr in SITE_ATTR_LIST:
        if attr in site_json[video]:
            video_dict[vid][attr] = site_json[video][attr]

# Stream the YouTube records and print the first 10 that merge with ALL
# of the site attributes.
merged_cnt = 0
with open(YOUTUBE_JSON, "r") as youtube_json_file:
    for line in youtube_json_file:
        if merged_cnt >= 10:
            break
        youtube_json = json.loads(line)
        vid = youtube_json['id']
        # Flatten: lift the nested 'data' dict and re-attach the id.
        merged_video = youtube_json['data']
        merged_video['id'] = vid
        if vid in video_dict:
            attr_cnt = 0
            for attr in SITE_ATTR_LIST:
                if attr in video_dict[vid]:
                    merged_video[attr] = video_dict[vid][attr]
                    attr_cnt += 1
            # Only emit records that carry every site attribute
            # (was a magic "4" -- tied to the attribute list instead).
            if attr_cnt == len(SITE_ATTR_LIST):
                print(json.dumps(merged_video))
                merged_cnt += 1
import pandas as pd
from pandas import Series, DataFrame
import json
# -- Load the merged TEDx records into a DataFrame and explore them. --
TEDX_JSON_FILE = "final_tedx.json"

# One JSON object per line -> list of dicts -> DataFrame indexed by id.
with open(TEDX_JSON_FILE, "r") as tedx_json_file:
    tedx_video_list = [json.loads(line) for line in tedx_json_file]
tedx_df = DataFrame(tedx_video_list)
tedx_df.set_index('id', inplace=True, drop=True)
tedx_df

# Top-10 languages by number of talks (and the non-English distribution).
tedx_df['lang'].value_counts()[:10]
tedx_df[tedx_df.lang != 'English']['lang'].value_counts().plot(kind="bar")

# Total views per language.  NOTE: DataFrame.sort(...) and Series.order(...)
# were deprecated and then removed from pandas; sort_values(...) is the
# supported spelling (and ascending takes a bool, not 0/1).
tedx_df['viewCount'] = tedx_df['viewCount'].fillna(0)
tedx_df[tedx_df.viewCount != '-'][['viewCount', 'lang']].groupby('lang').sum().sort_values('viewCount', ascending=False)[:10]
tmp_tedx_df = tedx_df[tedx_df.lang != 'English'].copy()
tmp_tedx_df[tmp_tedx_df.viewCount != '-'][['viewCount', 'lang']].groupby('lang').sum().sort_values('viewCount', ascending=False).plot(kind="bar")

# Number of distinct events per language.
tedx_df.groupby('lang').event.nunique().sort_values(ascending=False)[:10]
tmp_tedx_df = tedx_df[tedx_df.lang != 'English'].copy()
tmp_tedx_df.groupby('lang').event.nunique().sort_values(ascending=False).plot(kind="bar")

# The same breakdowns by country.
tedx_df['country'].value_counts()[:10]
tedx_df[tedx_df.viewCount != '-'][['viewCount', 'country']].groupby('country').sum().sort_values('viewCount', ascending=False)[:10]
tedx_df.groupby('country').event.nunique().sort_values(ascending=False)[:10]
# more information for visualization, including how to prepare data for D3.js
# http://nbviewer.ipython.org/5501063
from IPython.display import HTML
# NOTE(review): the HTML payload is empty -- the embedded visualization
# markup was evidently stripped when this notebook was exported, so this
# renders nothing.  TODO restore the original embed snippet.
HTML('')
# you can navigate years by sliding 'year' digits
# (duplicate import kept as-is: each notebook cell imported independently)
from IPython.display import HTML
HTML('')
from pandas import read_csv
from urllib import urlopen
from pandas import Series, DataFrame
# -- Compare against the number of native speakers per language. --
# read_csv accepts a local path directly; routing the bare filename
# through urlopen relied on Python 2 urllib's local-file fallback and
# fails on Python 3.
df = read_csv("list of languages by number of native speaker.csv")
df.set_index('Language', inplace=True, drop=True)
# Bar chart of the 30 most-spoken languages.
df[:30].plot(kind="bar")