%matplotlib inline import matplotlib.pyplot as plt from bs4 import BeautifulSoup import urllib import json from collections import namedtuple import numpy as np import sqlite3 def create_db(): conn = sqlite3.connect('nfl_teams.db') c = conn.cursor() c.execute('''CREATE TABLE IF NOT EXISTS players (number int, name text, pos text, height int, weight int, dob text, team text)''') conn.commit() conn.close() #Use teams.txt to get the team abbreviations and cities: all_city = [] all_abbr = [] all_name = [] with open("teams.txt","r") as f: for line in f: line = line.rstrip() line = line.split(" - ") all_city.append(line[1]) all_abbr.append(line[0].rstrip()) all_name.append(line[2].title()) def feet2inches(instr): """ Convert a string in f the form X'Y" into an integer number of inches""" instr = "".join([c for c in instr if c not in ('"',' ')]) instr = instr.split("'") instr = [int(c) for c in instr] return int(12*instr[0] + instr[1]) players = [] create_db() url_base = "http://www.nfl.com/teams/roster?team=" for abbr, city, name in zip(all_abbr,all_city,all_name): print "Parsing team : ",city,name url = url_base+abbr soup = BeautifulSoup(urllib.urlopen(url) ) for row in soup.findAll('table')[1].tbody.findAll('tr'): try: number = int(row.findAll('td')[0].contents[0]) name = str(row.findAll('td')[1].a.contents[0]) pos = str(row.findAll('td')[2].contents[0]) height = feet2inches(str(row.findAll('td')[4].contents[0])) weight = int(row.findAll('td')[5].contents[0]) dob = str(row.findAll('td')[6].contents[0]) player_record = (number,name,pos,height,weight,dob,abbr) conn = sqlite3.connect('nfl_teams.db') c = conn.cursor() c.execute("INSERT INTO players VALUES (?,?,?,?,?,?,?)",player_record) conn.commit() conn.close() players.append(player_record) except: pass print " --done."