# Flattened notebook-style script: scrapes TEDx talk metadata from
# tedxtalks.ted.com, pulls upload metadata from the legacy YouTube gdata
# JSON-C API, merges the two JSON sources, and explores the result with pandas.
# NOTE(review): the YouTube gdata v2 API used in section 4 was shut down in
# 2015; those requests will fail today and are kept only for provenance.

import json
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import HTML
from pandas import DataFrame, Series, read_csv

# --- 1. List every language portal linked from the TEDx home page. ---
TEDX_HOME_URL = "http://tedxtalks.ted.com"
LANG_URL = "/browse/talks-by-language/"

s = requests.get(TEDX_HOME_URL)
# Explicit parser avoids bs4's "no parser specified" warning and keeps the
# parse deterministic across environments.
soup = BeautifulSoup(s.content, "html.parser")
total = 0
for link_tag in soup.find_all('a', href=True):
    link = link_tag['href']
    # The language name sits a few text nodes after the anchor element itself.
    lang = link_tag.next_element.next_element.next_element
    if link.startswith(LANG_URL):
        print("Language %s: %s" % (lang, link))
        total += 1
print("Total: %d" % (total))

# --- 2. Walk the paginated Icelandic portal and collect video links. ---
VIDEO_LINK_PREFIX = "mvp_grid_panel_img_"
MSG_CLASS = "mvp_padded_message"
EMPTY_PAGE_MSG = "This page is empty."

portal_url = "http://tedxtalks.ted.com/browse/talks-by-language/icelandic"
page = 1
while True:
    # EX: http://tedxtalks.ted.com/browse/talks-by-language/icelandic?page=1
    url = portal_url + "?page=" + str(page)
    print("Reading URL: " + url)
    s = requests.get(url)
    soup = BeautifulSoup(s.content, "html.parser")
    # Past the last page the portal renders a "This page is empty." message;
    # that is the only termination signal, so stop when we see it.
    msg_tag = soup.find('div', {'class': MSG_CLASS})
    if msg_tag and msg_tag.get_text() == EMPTY_PAGE_MSG:
        print("empty page.")
        break
    link_tags = soup.find_all('a', id=re.compile(VIDEO_LINK_PREFIX), href=True)
    for link_tag in link_tags:
        link = link_tag['href']
        # EX: from /list/search%3Atag%3A%22chinese%22/video/The-tragedy-of-Hong-Kong-Archiv
        #     to   /video/The-tragedy-of-Hong-Kong-Archiv
        pos = link.find("/video")
        link = link[pos:]
        print("video link: %s (attr Language: Icelandic)" % (link))
    page += 1

# --- 3. Extract the embedded YouTube ID from a single talk page. ---
# NOTE(review): the original capture pattern was lost when the notebook was
# flattened; ".*" is a placeholder that will over-match -- restore the real
# embed-URL pattern (with a capture group for the video ID) before relying
# on this section.
VIDEO_ID_RE = b"""
.*
"""

url = "http://tedxtalks.ted.com/video/TEDxReykjavik-Eythor-Edvardsson"
s = requests.get(url)
html = s.content
video_ids = re.findall(VIDEO_ID_RE, html, re.IGNORECASE | re.VERBOSE)
for video_id in video_ids:
    # NOTE: findall over bytes yields byte strings => decode before printing.
    print("YouTube ID: %s (%s)" % (url, video_id.decode('utf-8')))

# --- 4. Pull upload metadata from the legacy YouTube gdata JSON-C API. ---
# https://developers.google.com/youtube/articles/view_youtube_jsonc_responses
# https://developers.google.com/youtube/2.0/developers_guide_jsonc
user_id = "tedxtalks"
maxcount = 25
count = 0

# First request only discovers the total item count so we can derive the
# number of pages.
s = requests.get("https://gdata.youtube.com/feeds/api/users/" + user_id
                 + "/uploads?v=2&alt=jsonc&start-index=1&max-result=1")
# The feed may arrive as several newline-separated JSON documents.
data = [json.loads(row) for row in s.text.split("\n") if row]
totalcount = data[0]['data']['totalItems']
# Floor division: the original relied on Python 2 "/" truncation; "//" keeps
# pagenumber an int so range() below still works under Python 3.
pagenumber = totalcount // maxcount + 1

key = ['id', 'uploaded', 'category', 'title', 'tags', 'thumbnail', 'duration',
       'likeCount', 'rating', 'ratingCount', 'viewCount', 'favoriteCount',
       'commentCount']
tedx = {'id': '',
        'data': {'uploaded': '', 'title': '', 'tags': '', 'thumbnail': '',
                 'duration': '', 'likeCount': '', 'rating': '',
                 'ratingCount': '', 'viewCount': '', 'favoriteCount': '',
                 'commentCount': ''}}

# Obtain data from each page (sample run: first page only).
for index in range(1, 2):  # full run: range(1, pagenumber)
    start_index = 1 if index == 1 else index * maxcount
    s = requests.get("https://gdata.youtube.com/feeds/api/users/" + user_id
                     + "/uploads?v=2&alt=jsonc&start-index=" + str(start_index)
                     + "&max-result=" + str(maxcount))
    data = [json.loads(row) for row in s.text.split("\n") if row]
    metadata = data[0]['data']['items']
    # Collect each item in the page (sample run: first 5 of up to 25).
    for i in range(5):  # full run: range(len(metadata))
        count += 1
        u = metadata[i]
        # Some items lack some of the expected keys; record those as '-'.
        missing = set(key) - set(u.keys())
        for j in key:
            if j == 'id':
                tedx['id'] = u['id']
            elif j == 'thumbnail':
                tedx['data'][j] = u[j]['hqDefault']
            elif j == 'title':
                # Py3 fix: the original .encode('utf-8') produced bytes,
                # which json.dumps cannot serialize; str is already Unicode.
                tedx['data'][j] = u[j]
            else:
                tedx['data'][j] = u[j] if j not in missing else '-'
        print(json.dumps(tedx))
    # Delay between pages to stay polite to the API.
    time.sleep(1)

# --- 5. Merge site-side attributes into the YouTube metadata records. ---
SITE_JSON = "tedx_video.json"
YOUTUBE_JSON = "tedx_v7.txt"
SITE_ATTR_LIST = ['lang', 'event', 'country', 'topic']

# Build vid -> {lang, event, country, topic} from the TEDx site dump.
with open(SITE_JSON) as site_json_file:
    site_json = json.load(site_json_file)
video_dict = {}
for video in site_json:
    vid = site_json[video]['id']
    video_dict[vid] = {}
    for attr in SITE_ATTR_LIST:
        if attr in site_json[video]:
            video_dict[vid][attr] = site_json[video][attr]

# Join with the YouTube records; emit only fully-attributed videos, and only
# the first 10 of them (sample run).
merged_cnt = 0
with open(YOUTUBE_JSON, "r") as youtube_json_file:
    for line in youtube_json_file:
        if merged_cnt >= 10:
            break
        youtube_json = json.loads(line)
        vid = youtube_json['id']
        merged_video = youtube_json['data']
        merged_video['id'] = vid
        if vid in video_dict:
            attr_cnt = 0
            for attr in SITE_ATTR_LIST:
                if attr in video_dict[vid]:
                    merged_video[attr] = video_dict[vid][attr]
                    attr_cnt += 1
            # Keep only records that gained all four site attributes.
            if attr_cnt == 4:
                print(json.dumps(merged_video))
                merged_cnt += 1

# --- 6. Explore the merged data with pandas. ---
TEDX_JSON_FILE = "final_tedx.json"
tedx_video_list = []
with open(TEDX_JSON_FILE, "r") as tedx_json_file:
    for line in tedx_json_file:
        tedx_video_list.append(json.loads(line))
tedx_df = DataFrame(tedx_video_list)
tedx_df.set_index('id', inplace=True, drop=True)
tedx_df

# Top languages by talk count; then non-English distribution as a bar chart.
tedx_df['lang'].value_counts()[:10]
tedx_df[tedx_df.lang != 'English']['lang'].value_counts().plot(kind="bar")

tedx_df['viewCount'] = tedx_df['viewCount'].fillna(0)
# pandas API fix: DataFrame.sort(col) and Series.order() were removed;
# sort_values is the supported replacement (ascending takes a bool).
tedx_df[tedx_df.viewCount != '-'][['viewCount', 'lang']] \
    .groupby('lang').sum().sort_values('viewCount', ascending=False)[:10]
tmp_tedx_df = tedx_df[tedx_df.lang != 'English'].copy()
tmp_tedx_df[tmp_tedx_df.viewCount != '-'][['viewCount', 'lang']] \
    .groupby('lang').sum().sort_values('viewCount', ascending=False).plot(kind="bar")
tedx_df.groupby('lang').event.nunique().sort_values(ascending=False)[:10]
tmp_tedx_df = tedx_df[tedx_df.lang != 'English'].copy()
tmp_tedx_df.groupby('lang').event.nunique().sort_values(ascending=False).plot(kind="bar")
tedx_df['country'].value_counts()[:10]
tedx_df[tedx_df.viewCount != '-'][['viewCount', 'country']] \
    .groupby('country').sum().sort_values('viewCount', ascending=False)[:10]
tedx_df.groupby('country').event.nunique().sort_values(ascending=False)[:10]

# more information for visualization, including how to prepare data for D3.js
# http://nbviewer.ipython.org/5501063
# NOTE(review): the embedded HTML payloads below were lost when the notebook
# was flattened; the empty strings render nothing.
HTML('')
# you can navigate years by sliding 'year' digits
HTML('')

# Py3 fix: read_csv opens the local path itself; the Py2-only
# urllib.urlopen wrapper was unnecessary (and fails on a bare filename).
df = read_csv("list of languages by number of native speaker.csv")
df.set_index('Language', inplace=True, drop=True)
df[:30].plot(kind="bar")