#!/usr/bin/env python
'''
get_pageview_by_day.py - batch download wiki pageview history

Usage: python get_pageview_by_day.py -s <start_date> -e <end_date> -d <datapath>
E.g.:  python get_pageview_by_day.py -s 20130410 -e 20130417 -d /Data/
The parameters will be set to defaults if not given.

@author: Brian Keegan and Yu-Ru Lin
@contact: bkeegan@gmail.com and yuruliny@gmail.com
@date: October 22, 2013
'''

import os, sys, urllib2, hashlib
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta
from datetime import datetime, timedelta

datapath = '/scratch/yuru/data/wiki/pagecounts_by_day'
if not os.path.exists(datapath):
    os.system('mkdir -p %s' % datapath)

# From http://stackoverflow.com/questions/4028697/how-do-i-download-a-zip-file-in-python-using-urllib2/4028728
def get_file(url, filepath, md5_dict):
    # Open the url
    try:
        f = urllib2.urlopen(url)
        #print "downloading " + url
        # Open our local file for writing
        with open(filepath + os.path.basename(url), "wb") as local_file:
            read_file = f.read()
            file_md5 = hashlib.md5(read_file).hexdigest()
            # Only write the file to disk if its checksum matches the published one
            if file_md5 == md5_dict[os.path.basename(url)]:
                local_file.write(read_file)
            else:
                raise ValueError('MD5 checksum on {0} does not match master.'.format(os.path.basename(url)))
    # Handle errors
    except urllib2.HTTPError, e:
        print "HTTP Error:", e.code, url
    except urllib2.URLError, e:
        print "URL Error:", e.reason, url

def get_dumps(start, end, datapath=os.getcwd() + '/Data/'):
    start_dt = datetime.strptime(start, '%Y%m%d')
    end_dt = datetime.strptime(end, '%Y%m%d')
    url_base = 'http://dumps.wikimedia.org/other/pagecounts-raw/'
    if not os.path.exists(datapath):
        os.system('mkdir -p {0}'.format(datapath))

    # Enforce time boundaries
    if start_dt < datetime(2007, 12, 10, 0, 0, 0):
        raise ValueError('Time range must be after 10 Dec 2007')
    elif end_dt > datetime.today():
        raise ValueError('Time range must end before today')

    # Step through the range one month at a time, starting from the first of
    # the start month so a partial month at the end of the range isn't skipped
    index = start_dt.replace(day=1)
    while index < end_dt:
        year = datetime.strftime(index, '%Y')
        month = datetime.strftime(index, '%m')

        # Make md5 checksum dictionary that can be checked after downloading files below
        # Each line of md5sums.txt is '<checksum> <filename>'; invert to map filename -> checksum
        md5_url = url_base + '{0}/{0}-{1}/'.format(year, month) + 'md5sums.txt'
        md5s = urllib2.urlopen(md5_url).readlines()
        md5_dict = dict([i.strip().split() for i in md5s if 'pagecounts' in i])
        md5_dict = {v: k for k, v in md5_dict.iteritems()}

        # Can't pass clean filenames since seconds field varies, scrape filenames instead
        url = url_base + '{0}/{0}-{1}/'.format(year, month)
        soup = BeautifulSoup(urllib2.urlopen(url))
        pagecount_links = [link.get('href') for link in soup.findAll('a') if 'pagecounts' in link.get('href')]

        # Keep only the links that fall in the valid timespan
        valid_links = [datetime.strptime(i, 'pagecounts-%Y%m%d-%H%M%S.gz') for i in pagecount_links]
        valid_links = [i for i in valid_links if start_dt <= i < end_dt]
        valid_links = [datetime.strftime(i, 'pagecounts-%Y%m%d-%H%M%S.gz') for i in valid_links]

        # If the directory doesn't exist, create it
        filepath = datapath + '{0}/{1}/'.format(year, month)
        if not os.path.exists(filepath):
            os.system('mkdir -p {0}'.format(filepath))

        # Get the data
        for link in valid_links:
            url = url_base + '{0}/{0}-{1}/'.format(year, month) + link
            # Check if file already exists and has non-zero length
            if os.path.exists(filepath + link) and os.path.getsize(filepath + link) > 0:
                print link + ' already exists!'
                continue
            try:
                print "Retrieving pageviews for " + link
                get_file(url, filepath, md5_dict)
            except (KeyboardInterrupt, SystemExit):
                sys.exit(0)
        index += relativedelta(months=1)

def main(argv):
    # Defaults: yesterday through today, saved under ./Data/
    start = datetime.strftime(datetime.today() - timedelta(days=1), '%Y%m%d')
    end = datetime.strftime(datetime.today(), '%Y%m%d')
    datapath = os.getcwd() + '/Data/'
    for i, arg in enumerate(argv):
        if arg in ['-h', '--help']:
            print 'Usage: python get_pageview_by_day.py -s <start_date> -e <end_date> -d <datapath>'
            return
        if arg in ['-s', '--start']:
            try:
                start = argv[i + 1]
            except IndexError:
                pass
        if arg in ['-e', '--end']:
            try:
                end = argv[i + 1]
            except IndexError:
                pass
        if arg in ['-d', '--dir']:
            try:
                datapath = argv[i + 1]
            except IndexError:
                pass
    get_dumps(start, end, datapath)

if __name__ == '__main__':
    main(sys.argv[1:])