In [1]:
import cookielib
import urllib2
import urllib
import time
import os               # used below to make sure the ./images directory exists
import re
from BeautifulSoup import BeautifulSoup
import pandas as pd     # pd.DataFrame / to_json are used in the driver cell
In [24]:
def setup_connection(username=None, password=None):
    # Build an opener that stores cookies and follows redirects, then log in
    # so every later request carries an authenticated session.
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(
        urllib2.HTTPCookieProcessor(cj),
        urllib2.HTTPRedirectHandler,
    )
    # Identify ourselves with a browser-ish User-Agent
    opener.addheaders.append(('User-agent', 'Mozilla/4.0'))

    url = 'http://www.okcupid.com/login'
    login_data = urllib.urlencode({
        'username': username,
        'password': password,
    })

    req = urllib2.Request(url, login_data)
    resp = opener.open(req)
    return opener
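A quick smoke test of the opener (the credentials and the /home URL below are assumptions, not part of the original run): after a successful login the cookie jar holds the session, so fetching the home page should not bounce back to the login page.
In [ ]:
# Hypothetical credentials -- replace with real ones before running.
test_opener = setup_connection(username='someuser', password='somepass')
resp = test_opener.open('http://www.okcupid.com/home')
print resp.geturl()  # on success this should not be the /login URL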
In [3]:
#once a page of search results is parsed, extract the usernames so they can
#be added to our collection (a set, so duplicates collapse automatically)
def process_names(userNameTags):
    userNames = [str(tag.contents[0]) for tag in userNameTags]

    return set(userNames)
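A small sanity check for process_names, using hand-rolled markup that mimics the '<a class="name">' tags the search results use (the tag shape comes from scrape_OKC below; the usernames are made up).
In [ ]:
sample = BeautifulSoup('<a class="name">alice</a><a class="name">bob</a><a class="name">alice</a>')
print process_names(sample.findAll('a', attrs={'class': 'name'}))
# set(['alice', 'bob']) -- the duplicate collapses because we return a set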
    
In [19]:
def scrape_OKC(iters=1, opener=None):
    userSet = set()

    # A deliberately wide-open match search; each request returns count=18 results
    wideOpenSearch = 'http://www.okcupid.com/match?filter1=0,63&filter2=76,4095&filter3=2,25,60&filter4=5,31536000&filter5=1,1&locid=0&timekey=1&matchOrderBy=SPECIAL_BLEND&custom_search=0&fromWhoOnline=0&mygender=f&update_prefs=1&sort_type=0&sa=1&using_saved_search=&count=18'
    i = 0
    failures = 0
    while i < iters:
        try:
            req = urllib2.Request(wideOpenSearch)
            resp = opener.open(req)
            soup = BeautifulSoup(resp)

            userNameTags = soup.findAll('a', attrs={'class': 'name'})
            userSet.update(process_names(userNameTags))
            #print "Request %d finished." % i
            i = i + 1
        except Exception:
            #print "ERROR ON %d" % i
            failures += 1
            if failures > 3 * iters:   # give up instead of retrying forever
                break
        time.sleep(3) # don't pound the OKCupid servers, even after an error
    return userSet
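Because consecutive searches return overlapping results, the number of unique names grows more slowly than iters * 18. A convenience wrapper (a sketch, not part of the original notebook) that keeps searching until a target count is reached:
In [ ]:
def scrape_until(target, opener=None, batch=5):
    # Re-run scrape_OKC in batches until we have at least `target` unique names.
    users = set()
    while len(users) < target:
        users.update(scrape_OKC(iters=batch, opener=opener))
        print "collected %d unique users so far" % len(users)
    return users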
In [20]:
#parse the profile page for each user in userSet
# Most profile fields sit in tags with ids of the form "ajax_*"; map each
# output key to its tag name and matching attribute so extraction is one loop.
PROFILE_FIELDS = [
    ('age',                         'span', 'id',    'ajax_age'),
    ('location',                    'span', 'id',    'ajax_location'),
    ('gender',                      'span', 'class', 'ajax_gender'),
    ('desired_gender',              'li',   'id',    'ajax_gentation'),
    ('desired_ages',                'li',   'id',    'ajax_ages'),
    ('desired_distance',            'li',   'id',    'ajax_near'),
    ('desired_single',              'li',   'id',    'ajax_single'),
    ('desired_relationship_status', 'li',   'id',    'ajax_lookingfor'),
    ('orientation',                 'dd',   'id',    'ajax_orientation'),
    ('ethnicity',                   'dd',   'id',    'ajax_ethnicities'),
    ('height',                      'dd',   'id',    'ajax_height'),
    ('body_type',                   'dd',   'id',    'ajax_bodytype'),
    ('diet',                        'dd',   'id',    'ajax_diet'),
    ('smokes',                      'dd',   'id',    'ajax_smoking'),
    ('drinks',                      'dd',   'id',    'ajax_drinking'),
    ('drugs',                       'dd',   'id',    'ajax_drugs'),
    ('religion',                    'dd',   'id',    'ajax_religion'),
    ('education',                   'dd',   'id',    'ajax_education'),
    ('job',                         'dd',   'id',    'ajax_job'),
    ('income',                      'dd',   'id',    'ajax_income'),
    ('relationship_status',         'dd',   'id',    'ajax_status'),
    ('relationship_type',           'dd',   'id',    'ajax_monogamous'),
    ('children',                    'dd',   'id',    'ajax_children'),
    ('pets',                        'dd',   'id',    'ajax_pets'),
    ('languages',                   'dd',   'id',    'ajax_languages'),
]

def parse_user(userSet, opener=None):
    userInfo = []
    for user in userSet:
        userURL = "http://www.okcupid.com/profile/" + user + "?cf=regular"

        try:
            req = urllib2.Request(userURL)
            resp = opener.open(req)
            soup = BeautifulSoup(resp)

            # Pull every profile field from the soup; a missing tag raises
            # AttributeError, which skips this user via the except below.
            userDict = {'userName': user}
            for key, tag, attr, value in PROFILE_FIELDS:
                userDict[key] = soup.find(tag, attrs={attr: value}).text
            # The desired age range renders as e.g. "25&ndash;35"; use ":" instead
            userDict['desired_ages'] = re.sub('&ndash;', ':', userDict['desired_ages'])
            userDict['userImage'] = soup.find('img')['src']

            userInfo.append(userDict)
            time.sleep(1) # don't pound the OKCupid servers

        except Exception:
            #print "ERROR ON %s" % user
            continue
    return userInfo
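To spot-check the parser on a single profile after running the setup cell (the username below is a placeholder), run it over a one-element set and print the record:
In [ ]:
info = parse_user(set(['some_username']), opener=opener)
if info:
    for key, value in sorted(info[0].items()):
        print "%s: %s" % (key, value)
else:
    print "profile failed to parse (network error or missing field)"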
In [21]:
def get_images(df):
    # Keep only the rows that actually have an image URL
    df = df[~df['userImage'].isnull()]
    userImages = df['userImage']
    userNames = df['userName']

    if not os.path.isdir('./images'):
        os.makedirs('./images')  # urlretrieve won't create the directory itself
    for name, image in zip(userNames, userImages):
        urllib.urlretrieve(image, './images/' + name + ".jpg")
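urlretrieve saves whatever bytes the URL serves, so naming every file .jpg is only a guess. A variant that keeps the extension from the image URL instead (a sketch; URLs with query strings appended would need extra cleanup):
In [ ]:
def get_images_keep_ext(df):
    df = df[~df['userImage'].isnull()]
    for name, image in zip(df['userName'], df['userImage']):
        # Fall back to .jpg when the URL carries no extension at all
        ext = os.path.splitext(image)[1] or '.jpg'
        urllib.urlretrieve(image, './images/' + name + ext)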
        
In [28]:
#main driver: log in, collect usernames, parse profiles, save everything
username = '<username>'
password = '<password>'

opener = setup_connection(username=username, password=password)
userSet = scrape_OKC(iters=1, opener=opener)    # each request returns up to 18 (overlapping) usernames
userInfo = parse_user(userSet, opener=opener)   # fetch and parse each unique user's profile
userDataFrame = pd.DataFrame(userInfo)          # move the records into a DataFrame
get_images(userDataFrame)                       # download each user's profile image
userDataFrame.to_json("OKC_scrape.json")        # write out the DataFrame
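The JSON file round-trips through pandas, so a later session can pick the data back up without re-scraping:
In [ ]:
df = pd.read_json("OKC_scrape.json")
print df.shape  # (number of users scraped, number of profile fields)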