This script collects a large number of random pages from a Wikipedia, keeping those whose Talk page is in a category that describes their "Class", e.g. "Stub", "Featured Article", etc.

In [1]:
# pywikibot is the MediaWiki client used by the cells below.
import pywikibot
# NOTE(review): pagegenerators is not referenced in the visible cells.
from pywikibot import pagegenerators
# Site handle for English Wikipedia; the sampling cell calls enwp.randompages().
enwp = pywikibot.Site('en','wikipedia')
# re is used by page_class() to pick the "<X>-Class" label out of category titles.
import re
# defaultdict(list) backs the class -> page-texts mapping built while sampling.
from collections import defaultdict
VERBOSE:pywiki:Starting 1 threads...
In [10]:
def page_class(page):
    """Return the assessment class named by the categories of ``page``'s talk page.

    The talk page's categories are scanned in the order they are yielded and
    the first title containing ``<word>-Class`` decides the result, e.g.
    ``Category:Stub-Class biography articles`` gives ``'Stub'``.

    Args:
        page: object exposing ``toggleTalkPage()``. The talk page it returns
            must expose ``categories()``, an iterable of objects with a
            ``title()`` method returning the category title as a string.

    Returns:
        The word in front of ``-Class`` exactly as it is spelled in the
        category title (case is not normalised, so ``'Unassessed'`` and
        ``'unassessed'`` are distinct results), or ``None`` when there is no
        talk page or no category matches.
    """
    talk = page.toggleTalkPage()
    if talk is None:
        # Nothing to inspect.  NOTE(review): pywikibot documents None for
        # special pages; confirm against the installed version.
        return None
    for cat in talk.categories():
        full_title = cat.title()
        # Strip the namespace prefix when it is present.  The earlier
        # ``split('Category:')[1]`` raised IndexError for any title that did
        # not contain the literal prefix.
        _, prefix, remainder = full_title.partition('Category:')
        cat_tit = remainder if prefix else full_title
        match = re.search(r'(\w+)\-Class', cat_tit)
        if match:
            return match.group(1)
    return None
In [13]:
# Maps each class label reported by page_class() to the list of page texts
# (page.get() results) collected for that label.
classed_pages = defaultdict(list)

# Currently there is a bug in pywikibot that only allows 25 random pages at a
# time, so the sample is gathered as many small, independent requests.
N_BATCHES = 2001       # number of randompages() requests to issue
BATCH_SIZE = 25        # pages requested per call (see the note above)
PAGES_PER_BATCH = 24   # pages consumed from each generator before abandoning it

for _ in range(N_BATCHES):
    random_pages = enwp.randompages(namespaces=[0], step=BATCH_SIZE, total=BATCH_SIZE)
    for seen, page in enumerate(random_pages, 1):
        wikiclass = page_class(page)
        if wikiclass:
            # Direct dict membership; `.keys()` built a fresh list per test on Python 2.
            if wikiclass not in classed_pages:
                # First sighting of this label: echo it as a progress marker.
                print(wikiclass)
            classed_pages[wikiclass].append(page.get())
        # Stop after the 24th page whether or not it was classified.  This
        # check used to sit inside the `if wikiclass:` branch, so an
        # unclassified 24th page skipped the cut-off and a 25th page was pulled.
        if seen == PAGES_PER_BATCH:
            break
print("done")
                
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
Stub
Start
B
C
List
GA
FA
Disambig
FL
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
A
NA
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
Redirect
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
unassessed
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
WARNING: Http response status 503
WARNING:pywiki:Http response status 503
WARNING: Non-JSON response received from server wikipedia:en; the server may be down.
WARNING:pywiki:Non-JSON response received from server wikipedia:en; the server may be down.
Set gcllimit = 2500
INFO:pywiki:Set gcllimit = 2500
WARNING: Waiting 5 seconds before retrying.
WARNING:pywiki:Waiting 5 seconds before retrying.
Current
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
Unassessed
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
Future
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
Needed
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
WARNING: Http response status 503
WARNING:pywiki:Http response status 503
WARNING: Non-JSON response received from server wikipedia:en; the server may be down.
WARNING:pywiki:Non-JSON response received from server wikipedia:en; the server may be down.
Set gcllimit = 2500
INFO:pywiki:Set gcllimit = 2500
WARNING: Waiting 5 seconds before retrying.
WARNING:pywiki:Waiting 5 seconds before retrying.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
WARNING: Http response status 503
WARNING:pywiki:Http response status 503
WARNING: Non-JSON response received from server wikipedia:en; the server may be down.
WARNING:pywiki:Non-JSON response received from server wikipedia:en; the server may be down.
Set gcllimit = 2500
INFO:pywiki:Set gcllimit = 2500
WARNING: Waiting 5 seconds before retrying.
WARNING:pywiki:Waiting 5 seconds before retrying.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
Deferred
WARNING: Http response status 503
WARNING:pywiki:Http response status 503
WARNING: Non-JSON response received from server wikipedia:en; the server may be down.
WARNING:pywiki:Non-JSON response received from server wikipedia:en; the server may be down.
Set gcllimit = 2500
INFO:pywiki:Set gcllimit = 2500
WARNING: Waiting 5 seconds before retrying.
WARNING:pywiki:Waiting 5 seconds before retrying.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
VERBOSE:pywiki:Found 1 wikipedia:en processes running, including this one.
done
In [14]:
import json

# Persist the class -> page-texts mapping.  The context manager closes the
# file as soon as the dump finishes; the bare open() used before left the
# handle open, so the data was only flushed when the object was collected.
with open('test_class_data.json', 'w') as out_file:
    json.dump(classed_pages, out_file)
In [19]:
# Total number of page texts collected, summed over every class label.
sum(map(len, classed_pages.itervalues()))
Out[19]:
38959