Utente:.pier.bot./Scripts
filtro.py (ver. 2.4.1) modifica
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Command line options:
-deb Enable debug print
-nofile Do not create the local report files
-nopage Do not save the wiki pages
-? Print this help page
-days: Number of days to examine
-start: Specify start offset (in days, 0=today, 1=yesterday, 2=...)
-num: Specify number of pages (if days==0 total number of pages, if days!=0 number of pages per block)
-file: Specify file name for report
-size: Minimum page size (for the long-page filter)
-sizeshort: Maximum page size (for the short-page filter)
-progetti Select pages for all projects
-archi Select pages for 'Architettura'
-volley Select pages for 'Pallavolo'
-ath Select pages for 'Atletica'
"""
# ------------------------------
# Script: filtro.py
# Author: %Pier%
# Thanks to: Jalo
# version: 2.4.1 date: 28/06/2014
# ------------------------------
VER = "filtro.py 2.4.1"
import wikipedia
import pagegenerators
import sys
import datetime
import time
import re
from wikipedia import Page
from family import Family
from wiki_lib import GetSize
from wiki_lib import getPageParameter
from wiki_lib import NewpagesPageGeneratorOffset
from wiki_lib import NewpagesPageGeneratorOffsetParam
# Indexes into the tuple returned by getPageParameter
CR_TIME = 1
AUTHOR = 2

# Templates that exclude a page from every report
template_list = (
    'A', 'C', 'E', 'P', 'S', 'T', 'W', 'V',
    'Controlcopy', 'Disambigua', 'Disambig', 'Sigla2', 'Sigla3', 'WND', 'Bufala',
    'Cancellazione', 'Da cancellare', 'Da correggere', 'Trasferimento',
    'F', 'ViolazioneCopyright', 'Stub', 'ListaBio', 'Tabellone tennis',
)
# Templates that exclude a page from the short-page report only
template_list_short = ('O', 'WIP', 'Tmp', 'Protetta', 'Avvisobloccominaccia', 'Avvisoblocco scad')
# Templates that exclude a page from the long-page report only
template_list_long = ()
# Authors (usually bots) whose pages are not checked
author_white_list = ('Biobot', 'Gacbot', 'BimBot', 'Kaspobot', 'NuclearBot', 'HubbleBot')

# Keywords used to select pages for each project.
# Raw strings keep the backslash-brace escapes as literal regex text without
# relying on Python passing unknown string escapes through unchanged.
reArchi = re.compile(r"architett", re.I)
reNotArchi = re.compile(r"(microcontroll|processor|videogioc|informatic|software|\{\{disambigua\}\}|\{\{cancellazione|\{\{Territori|\{\{Film|Crittografia|Categoria:Motori)", re.I)
reVolley = re.compile(r"pallavol", re.I)
reNotVolley = re.compile(r"\{\{disambigua\}\}", re.I)
reAthletics = re.compile(r"atletica", re.I)
reNotAthletics = re.compile(r"\{\{disambigua\}\}", re.I)

# Global state: report texts accumulated for the wiki pages
resultShort = ""
resultLong = ""
resultShortArchi = ""
resultLongArchi = ""
resultVolley = ""
resultAthletics = ""
# Global state: counters printed in the final statistics
count_short = 0
count_long = 0
count_short_archi = 0
count_long_archi = 0
count_volley = 0
count_athletics = 0
count_redirect = 0
################################################################################
# ritorna testo con dati della pagina wiki
#
def PageData(page, timestamp = None, username = None, size = 0):
    """Return one wiki-list line ('# ...' plus newline) describing ``page``.

    The line contains, in order: the creation timestamp, the page itself
    (formatted with ``%s``), the size in bytes, a link to the author's user
    page and three coloured flags telling whether the page has images,
    interwiki links and categories.

    Parameters:
        page      -- object providing imagelinks(), interwiki() and categories()
        timestamp -- creation time as text, or None when unknown (then omitted)
        username  -- author name as text, or None when unknown (then the
                     user link is omitted)
        size      -- page size, formatted with ``%d``

    Returns the assembled line as a string.
    """
    def _utf8(value):
        # ``str`` is the byte string on Python 2 and the text string on
        # Python 3; in both cases it can be formatted as is.  Only Python 2
        # ``unicode`` objects still need the explicit UTF-8 encoding.
        if isinstance(value, str):
            return value
        return value.encode("utf-8")

    def _flag(items, label):
        # green "<label>" when the page has at least one item, red "no <label>" otherwise
        if len(items) == 0:
            return ' - <span style="color:#FF0000">no %s</span>' % label
        return ' - <span style="color:#008000">%s</span>' % label

    # The signature allows timestamp/username to be None, so they must not be
    # dereferenced unconditionally.
    if timestamp is None:
        Text = '# '
    else:
        Text = '# %s ' % _utf8(timestamp)
    Text += '%s' % (page,)
    Text += ' - [%d byte]' % (size)
    if username is not None:
        Text += ' - [[Utente:%s|]] ' % _utf8(username)
    Text += _flag(page.imagelinks(), 'immagini')
    Text += _flag(page.interwiki(), 'interwiki')
    Text += _flag(page.categories(), 'categorie') + '\n'
    return Text
################################################################################
# filtro pagine per progetto architettura
#
def filtro_archi(page, text, timestamp = None, username = None, size = 0):
    """Record ``page`` in the architecture project report when its text matches.

    A page is selected when ``text`` matches reArchi and does not match
    reNotArchi.  Selected pages larger than ``size_short`` go to the "long"
    report, all the others to the "short" one; the matching counter is
    incremented and the line built by PageData is written to the report file
    (when ``savefile``) and appended to the wiki text (when ``savepage``).
    """
    global resultShortArchi, resultLongArchi
    global count_short_archi, count_long_archi
    if not reArchi.search(text):
        return
    if reNotArchi.search(text) is not None:
        return
    if test:
        print("Page (archi): %s" % page)
    entry = PageData(page, timestamp, username, size)
    if size > size_short:
        count_long_archi += 1
        if savefile:
            fal.write(entry)
        if savepage:
            resultLongArchi += entry
    else:
        count_short_archi += 1
        if savefile:
            fas.write(entry)
        if savepage:
            resultShortArchi += entry
################################################################################
# filtro pagine per progetto pallavolo
#
def filtro_volley(page, text, timestamp = None, username = None, size = 0):
    """Record ``page`` in the volleyball project report when its text matches.

    A page is selected when ``text`` matches reVolley and does not match
    reNotVolley.  For a selected page the counter is incremented and the line
    built by PageData is written to the report file (when ``savefile``) and
    appended to the wiki text (when ``savepage``).
    """
    global resultVolley
    global count_volley
    if not reVolley.search(text):
        return
    if reNotVolley.search(text) is not None:
        return
    if test:
        print("Page (volley): %s" % page)
    entry = PageData(page, timestamp, username, size)
    count_volley += 1
    if savefile:
        fv.write(entry)
    if savepage:
        resultVolley += entry
################################################################################
# filtro pagine per progetto atletica
#
def filtro_athletics(page, text, timestamp = None, username = None, size = 0):
    """Record ``page`` in the athletics project report when its text matches.

    A page is selected when ``text`` matches reAthletics and does not match
    reNotAthletics.  For a selected page the counter is incremented and the
    line built by PageData is written to the report file (when ``savefile``)
    and appended to the wiki text (when ``savepage``).
    """
    global resultAthletics
    global count_athletics
    if not reAthletics.search(text):
        return
    if reNotAthletics.search(text) is not None:
        return
    if test:
        print("Page (athletics): %s" % page)
    entry = PageData(page, timestamp, username, size)
    count_athletics += 1
    if savefile:
        fat.write(entry)
    if savepage:
        resultAthletics += entry
################################################################################
# funzione principale di filtro pagine
#
def workon(page, timestamp = None, username = None):
    """Main page filter: classify ``page`` as short and/or long and record it.

    Steps, in order:
      1. redirects are counted and logged to the redirect file;
      2. the enabled project filters are run on the page text;
      3. the page is dropped unless its size is at most ``size_short`` (short)
         or at least ``size_long`` (long);
      4. the page is dropped when it carries a black-listed template;
      5. missing ``timestamp``/``username`` are read with getPageParameter();
      6. pages by white-listed authors are dropped;
      7. the PageData line is added to the short and/or long report.

    The wikipedia exceptions raised while reading the page are reported on
    the console and/or in the error log and end the processing of the page.
    """
    global resultLong, resultShort
    global count_short, count_long, count_redirect
    #global count
    if test:
        # start time of this call; not used further below
        debug_started = time.time()
    if page.isRedirectPage():
        count_redirect += 1
        redirect_line = '# %s \n' % (page)
        if savefile:
            fr.write(redirect_line)
    try:
        text = page.get(get_redirect=True)
        if test:
            print("----------------------------------------------------")
            print("Page (%d) %s" % (count, page))
        size = page.GetSize()

        # project filters, in the fixed order archi / volley / athletics
        project_filters = (
            (select_archi, filtro_archi),
            (select_volley, filtro_volley),
            (select_athletics, filtro_athletics),
        )
        for enabled, project_filter in project_filters:
            if enabled:
                project_filter(page, text, timestamp, username, size)

        is_short = not (size > size_short)
        is_long = not (size < size_long)
        if not (is_long or is_short):
            if test:
                print(" Page not selected for size (%d)" % size)
            return

        templates = page.templates()
        if test:
            print(" Template in page: %s" % templates)
        # general black list first, then the ones tied to the size class
        blacklists = [template_list]
        if is_long:
            blacklists.append(template_list_long)
        if is_short:
            blacklists.append(template_list_short)
        for blacklist in blacklists:
            for name in templates:
                if name in blacklist:
                    if test:
                        print(" Page not selected for template: (%s)" % name)
                    return

        # from here on the page is short and/or long
        if timestamp is None or username is None:
            pageparam = page.getPageParameter()
            if pageparam is None:
                print(" pageparam = None, skip page %s" % page)
                if savefile:
                    fe.write("Skip page: %s\n" % page)
                return
            timestamp = pageparam[CR_TIME]
            username = pageparam[AUTHOR]
        if username in author_white_list:
            if test:
                print(" Author %s in whitelist" % username)
            return

        entry = PageData(page, timestamp, username, size)
        if is_long:
            count_long += 1
            if savefile:
                fl.write(entry)
            if savepage:
                resultLong += entry
            if test:
                print(" --> Long %s" % page)
        if is_short:
            count_short += 1
            if savefile:
                fs.write(entry)
            if savepage:
                resultShort += entry
            if test:
                print(" --> Short %s" % page)
    except wikipedia.IsRedirectPage:
        if test:
            print("Double Redirect ? %s" % page)
        if savefile:
            fe.write("Double Redirect ? %s\n" % page)
    except wikipedia.NoPage:
        print("Page doesn't exist! %s" % page)
        if savefile:
            fe.write("Page doesn't exist! %s\n" % page)
    except wikipedia.SectionError:
        print("Redirect: SectionError! %s" % page)
        if savefile:
            fe.write("Redirect: SectionError! %s\n" % page)
    except wikipedia.InvalidTitle:
        print("InvalidTitle! %s" % page)
        if savefile:
            fe.write("InvalidTitle! %s\n" % page)
###################################################################################################
#
# m a i n
#
###################################################################################################
try:
    # defining all default values
    filename = "report"
    filenameArchi = "report-archi"
    filenameVolley = "report-volley"
    filenameAthletics = "report-athletics"
    botcomment = VER
    test = False
    count = 0
    days = 1
    start_offset = 0
    num_pages = 100
    size_long = 4000
    size_short = 1000
    savefile = True
    savepage = True
    OffsetStringStart = None
    OffsetStringEnd = None
    select_archi = False
    select_volley = False
    select_athletics = False
    # every report file opened below, so that all of them get closed even
    # when the page loop raises
    report_files = []
    # handling command line parameters
    for arg in wikipedia.handleArgs():
        if arg.startswith("-deb"):
            test = True
        elif arg.startswith("-nofile"):
            savefile = False
        elif arg.startswith("-nopage"):
            savepage = False
        elif arg.startswith("-start:"):
            start_offset = int(arg[7:])
            # NOTE(review): the help text says 0=today, but an explicit
            # offset is clamped to 1..30 here -- confirm which is intended.
            if start_offset < 1:
                start_offset = 1
            if start_offset > 30:
                start_offset = 30
        elif arg.startswith("-num:"):
            num_pages = int(arg[5:])
            if num_pages < 1:
                num_pages = 100
            if num_pages > 5000:
                num_pages = 5000
        elif arg.startswith("-size:"):
            size_long = int(arg[6:])
            if size_long < 10:
                size_long = 10
        elif arg.startswith("-sizeshort:"):
            size_short = int(arg[11:])
            if size_short < 10:
                size_short = 10
        elif arg.startswith("-days:"):
            days = int(arg[6:])
            if days > 30:
                days = 30
        elif arg.startswith("-file:"):
            filename = arg[6:]
        elif arg.startswith("-archi"):
            select_archi = True
        elif arg.startswith("-volley"):
            select_volley = True
        elif arg.startswith("-ath"):
            select_athletics = True
        elif arg.startswith("-progetti"):
            select_archi = True
            select_volley = True
            select_athletics = True
        elif arg.startswith("-?"):
            # show the help and stop instead of going on to run the bot;
            # the outer ``finally`` still calls wikipedia.stopme()
            wikipedia.showHelp('filtro')
            sys.exit()
    # The start of the time window is computed once all the options are known:
    # it used to be set only inside the "-start:" handler, which left DateStart
    # undefined without that option and made the debug print depend on "-deb"
    # coming first on the command line.
    DateStart = datetime.datetime.today() - datetime.timedelta(days=int(start_offset))
    DateStart = DateStart.replace(hour=23, minute=59, second=59)
    # shift the local end-of-day by the local/UTC difference
    UTCOffset = datetime.datetime.today() - datetime.datetime.utcnow()
    DateStart -= UTCOffset
    OffsetStringStart = DateStart.strftime("%Y%m%d%H%M%S")
    if test:
        print("Start day is %s" % OffsetStringStart)
    # Attach at runtime
    Page.GetSize = GetSize
    Page.getPageParameter = getPageParameter
    wikipedia.output(u'------ Start process ------')
    time_start = time.time()
    stat_string = ('Report generato il %s <br/>\n' % datetime.date.today() )
    stat_string += ('Parametri dello script:<br/>\n')
    stat_string += ('Offset di inizio (giorni): %d - Giorni esaminati: %d<br/>\n\n' % ((start_offset, days) ))
    print(stat_string)
    resultShort = resultLong = stat_string
    resultLongArchi = resultShortArchi = ('Report generato il %s <br/>\n' % datetime.date.today() )
    resultVolley = resultAthletics = ('Report generato il %s <br/>\n' % datetime.date.today() )
    DateEnd = DateStart - datetime.timedelta(days=int(days))
    OffsetStringEnd = DateEnd.strftime("%Y%m%d%H%M%S")
    if test:
        print("End day is %s" % OffsetStringEnd)
    try:
        if savefile:
            fl = open(filename+'-long.txt','w+')
            report_files.append(fl)
            fs = open(filename+'-short.txt','w+')
            report_files.append(fs)
            fr = open(filename+'-redir.txt','w+')
            report_files.append(fr)
            fe = open(filename+'-error-log.txt','w+')
            report_files.append(fe)
            if select_archi:
                fal = open(filenameArchi+'-long.txt','w+')
                report_files.append(fal)
                fas = open(filenameArchi+'-short.txt','w+')
                report_files.append(fas)
            if select_volley:
                fv = open(filenameVolley+'.txt','w+')
                report_files.append(fv)
            if select_athletics:
                fat = open(filenameAthletics+'.txt','w+')
                report_files.append(fat)
        mysite = wikipedia.getSite()
        # Two generators are used because "for page in generator:" cannot
        # deliver the extra values besides the page: PreloadingGenerator only
        # hands out the page item even though newpages (and its derivatives)
        # yield a tuple of several items.
        basicgenerator = NewpagesPageGeneratorOffset(number=num_pages, OffsetStart=OffsetStringStart, OffsetEnd=OffsetStringEnd, repeat=True, site=mysite, days=days)
        basicgenerator2 = NewpagesPageGeneratorOffsetParam(number=num_pages, OffsetStart=OffsetStringStart, OffsetEnd=OffsetStringEnd, repeat=True, site=mysite, days=days)
        # try without PreloadingGenerator? measure whether it is much slower
        generator = pagegenerators.PreloadingGenerator(basicgenerator, pageNumber=num_pages)
        for page in generator:
            count += 1
            #print("Page (%d) %s" % (count, page) )
            # The second generator may run out before the first one: treat
            # that like a mismatch instead of letting StopIteration escape.
            entry = next(basicgenerator2, None)
            if entry is None or page != entry[0]:
                print("Page mismatch: %s" % page)
                # workon() reads the metadata from the page itself when these
                # are None (the old code tried to unpack None into five names,
                # which raises TypeError)
                timestamp = length = loggedIn = username = comment = None
            else:
                page2, timestamp, length, loggedIn, username, comment = entry
                username = username.replace(" (pagina inesistente)", "")
                username = username.replace("Contributi/", "")
            #if test:
            # print("Page (%d) %s" % (count, page) )
            workon(page, timestamp, username)
    finally:
        for report_file in report_files:
            report_file.close()
    if count == 0:
        wikipedia.output(u'------ Some error: no page to check ------')
        wikipedia.stopme()
        sys.exit()
    if savepage:
        wikipedia.output(u'------ Save pages ------')
        PageShort = wikipedia.Page(mysite, "Utente:.pier.bot./pagine corte")
        PageLong = wikipedia.Page(mysite, "Utente:.pier.bot./pagine lunghe")
        PageLog = wikipedia.Page(mysite, "Utente:.pier.bot./log")
        PageShort.put(resultShort.decode( "utf-8" ), comment=botcomment)
        PageLong.put(resultLong.decode( "utf-8" ), comment=botcomment)
        if select_archi:
            PageLongArchi = wikipedia.Page(mysite, "Progetto:Architettura/Nuove (automatiche)")
            PageShortArchi = wikipedia.Page(mysite, "Progetto:Architettura/Stub (automatiche)")
            PageLongArchi.put(resultLongArchi.decode( "utf-8" ), comment=botcomment)
            PageShortArchi.put(resultShortArchi.decode( "utf-8" ), comment=botcomment)
        if select_volley:
            PageVolley = wikipedia.Page(mysite, "Progetto:Sport/Pallavolo/Nuove (automatiche)")
            PageVolley.put(resultVolley.decode( "utf-8" ), comment=botcomment)
        if select_athletics:
            PageAthletics = wikipedia.Page(mysite, "Progetto:Sport/Atletica leggera/Nuove (automatiche)")
            PageAthletics.put(resultAthletics.decode( "utf-8" ), comment=botcomment)
    wikipedia.output(u'--- Statistics ------------')
    time_end = time.time()
    time_el = time_end - time_start
    # count is > 0 here (the empty case exits above), so the percentages are safe
    stat_string += ('Pagine esaminate : %d<br/>\n' % count)
    stat_string += ('Pagine corte selezionate : %d (%d%%)<br/>\n' % (count_short, 100*count_short/count) )
    stat_string += ('Pagine lunghe selezionate: %d (%d%%)<br/>\n' % (count_long, 100*count_long/count) )
    stat_string += ('Redirect : %d<br/>\n\n' % count_redirect)
    if select_archi:
        stat_string += ('- Architettura -<br/>\n')
        stat_string += ('Abbozzi : %d (%d%%)<br/>\n' % (count_short_archi, 100*count_short_archi/count) )
        stat_string += ('Voci : %d (%d%%)<br/>\n\n' % (count_long_archi, 100*count_long_archi/count) )
    if select_volley:
        stat_string += ('- Pallavolo -<br/>\n')
        stat_string += ('Voci : %d (%d%%)<br/>\n\n' % (count_volley, 100*count_volley/count) )
    if select_athletics:
        stat_string += ('- Atletica -<br/>\n')
        stat_string += ('Voci : %d (%d%%)<br/>\n\n' % (count_athletics, 100*count_athletics/count) )
    stat_string += ('Tempo impiegato: %3d:%02d <br/>\n' % (time_el / 60, time_el % 60) )
    print(stat_string)
    if savepage:
        PageLog.put(stat_string.decode( "utf-8" ), comment=botcomment)
    wikipedia.output(u'------ End process ------')
finally:
    wikipedia.stopme()
wiki_lib.py (ver. 2.00) modifica
# -*- coding: utf-8 -*-
__version__='$Id: wiki_lib.py,v 2.00 2011/02/19 22:44::00 %Pier% $'
"""
Library to extend pywikipedia framework
"""
import re
import datetime
import query
from wikipedia import Page
from wikipedia import Site
from wikipedia import removeLanguageLinks
from wikipedia import removeCategoryLinks
from wikipedia import get_throttle
from family import Family
#
# class that will be added to Page
#
def GetSize(self):
    """
    Return the length of the page text once the interwiki (language) links
    and the category links have been removed.
    """
    without_interwiki = removeLanguageLinks(self.get())
    without_categories = removeCategoryLinks(without_interwiki, site = self.site())
    return len(without_categories)
def getPageParameter(self):
    """
    Return the first entry of the version history, requested with
    reverseOrder=True and revCount=1.

    When the history is empty, a tuple (latest revision, edit time,
    user name, None) built from the page itself is returned instead.
    """
    history = self.getVersionHistory(reverseOrder=True, revCount=1)
    try:
        first_entry = history[0]
    except IndexError:
        # empty history: fall back to what the page object already knows
        revision = self.latestRevision()
        author = self.userName()
        edited = self.editTime()
        return revision, edited, author, None
    return first_entry
def getAuthor(self):
    """
    Return item 2 (the author, including anonymous IP) of the first entry
    of the version history requested with reverseOrder=True.
    """
    history = self.getVersionHistory(reverseOrder = True)
    first_entry = history[0]
    return first_entry[2]
def getCreationTime(self):
    """
    Return item 1 (the creation timestamp) of the first entry of the version
    history requested with reverseOrder=True.

    If reading the history fails, the failure is reported on the console and
    the original exception is re-raised.  (The previous handler formatted the
    still unbound ``edits`` and so replaced the real error with an
    UnboundLocalError.)
    """
    try:
        edits = self.getVersionHistory(reverseOrder = True)
    except Exception:
        print("Error in getCreationTime(): page %s" % self)
        raise
    return edits[0][1]
#
# class that will be added to Site (in wikipedia.py)
#
def newpages_address_off(self, n=50, OffsetString=None):
    """Return the address built by the site's family for ``self.lang``, ``n`` and ``OffsetString``."""
    family = self.family
    language = self.lang
    return family.newpages_address_offset(language, n, OffsetString)
#
# class that will be added to Family (in family.py)
#
def newpages_address_offset(self, code, limit=50, OffsetString=None):
    """Return the Newpages address for ``code`` with the given ``limit`` and ``offset`` values."""
    fields = (
        self.path(code),
        self.special_namespace_url(code),
        limit,
        OffsetString,
    )
    return "%s?title=%s:Newpages&limit=%d&offset=%s" % fields
#
# new class (derived from pagegenerators.py NewpagesPageGenerator)
#
def NewpagesPageGeneratorOffset(number = 100, OffsetStart = None, OffsetEnd = None, repeat = False, site = None, days = None):
    """
    Yield only the first item (the page) of every tuple produced by
    newpagesoffset(), called with the same arguments.

    When ``site`` is None, wikipedia.getSite() is used.
    """
    if site is None:
        # This module only does "from wikipedia import ...", so the module
        # name itself has to be imported before it can be used here.
        import wikipedia
        site = wikipedia.getSite()
    for entry in newpagesoffset(site=site, number=number, OffsetStart=OffsetStart, OffsetEnd=OffsetEnd, repeat=repeat, days = days):
        yield entry[0]
def NewpagesPageGeneratorOffsetParam(number = 100, OffsetStart = None, OffsetEnd = None, repeat = False, site = None, days = None):
    """
    Yield the complete tuples (page, date, length, loggedIn, username,
    comment) produced by newpagesoffset(), called with the same arguments.

    When ``site`` is None, wikipedia.getSite() is used.
    """
    if site is None:
        # This module only does "from wikipedia import ...", so the module
        # name itself has to be imported before it can be used here.
        import wikipedia
        site = wikipedia.getSite()
    for (page, date, length, loggedIn, username, comment) in newpagesoffset(site=site, number=number, OffsetStart=OffsetStart, OffsetEnd=OffsetEnd, repeat=repeat, days = days):
        yield page, date, length, loggedIn, username, comment
#
# new class (derived from wikipedia.py Site.newpages)
#
def newpagesoffset_old(site = None, number = 10, repeat = False, OffsetString = None, days = None):
#def newpages(self, number = 10, repeat = False):
    """Generator that scrapes the Newpages HTML listing and yields new pages.

    Each fetched listing is parsed with a regular expression and every title
    not seen before is yielded as a tuple
    (page, date, length, loggedIn, username, comment), where page is a Page
    object, date and username are the matched text, length is an int and
    loggedIn and comment are always empty unicode strings.

    When repeat is False a single listing is fetched.  When repeat is True
    the date of the last matched entry becomes the offset of the next fetch,
    and the loop ends when that date is older than the start date minus
    ``days`` days, or when ``days`` > 0 and a fetch produced no new page.

    Arguments:
    site         -- site object used to build the address and fetch the HTML
    number       -- number of entries requested per fetch
    repeat       -- keep fetching older listings (see above)
    OffsetString -- start offset as "%Y%m%d%H%M%S" text, or None for "now"
    days         -- number of days to go back
    """
    # Begin -------------------------------------------- %Pier%
    # Attach at runtime
    Site.newpages_address_off = newpages_address_off
    Family.newpages_address_offset = newpages_address_offset
    # difference between local time and UTC, used to shift the dates below
    UTCOffset = datetime.datetime.today() - datetime.datetime.utcnow()
    # End -------------------------------------------- %Pier%
    # The throttling is important here, so always enabled.
    # NOTE(review): ``throttle`` is assigned but never read in this function.
    if repeat:
        throttle = True
    seen = set()
    # Begin -------------------------------------------- %Pier%
    # DateStart is the oldest date of interest.
    # NOTE(review): int(days) raises TypeError when days is None (the
    # default); the None -> 0 normalisation only happens further down.
    if OffsetString == None:
        DateStart = datetime.datetime.today() - datetime.timedelta(days=int(days))
        DateStart -= UTCOffset
    else:
        DateStart = datetime.datetime.strptime(OffsetString, "%Y%m%d%H%M%S") - datetime.timedelta(days=int(days))
        #DateStart = DateStart.replace(hour=23, minute=59, second=59)
        #DateStart -= UTCOffset
    #DateStart -= UTCOffset
    # End -------------------------------------------- %Pier%
    while True:
        # Begin -------------------------------------------- %Pier%
        PageFound = False
        if OffsetString == None:
            path = site.newpages_address(n=number)
        else:
            path = site.newpages_address_off(n=number, OffsetString=OffsetString)
        # End -------------------------------------------- %Pier%
        get_throttle()
        # Begin -------------------------------------------- %Pier%
        #html = self.getUrl(path)
        html = site.getUrl(path)
        # End -------------------------------------------- %Pier%
        # entryR = re.compile('<li[^>]*>(?P<mw-newpages-time>.+?) \S*?<a href=".+?" title="(?P<title>.+?)">.+?</a>.+?[\(\[](?P<mw-newpages-length>\d+)[^\)\]]*[\)\]] .?<a href=".+?" title=".+?:(?P<username>.+?)"')
        entryR = re.compile('<li[^>]*>(?P<date>.+?) \S*?<a href=".+?"'
                            ' title="(?P<title>.+?)">.+?</a>.+?[\(\[](?P<length>[\d,.]+)[^\)\]]*[\)\]]'
                            ' .?<a href=".+?" title=".+?:(?P<username>.+?)">')
        for m in entryR.finditer(html):
            date = m.group('date')
            # debug output of every matched date (Python 2 print statement)
            print date
            title = m.group('title')
            # NOTE(review): as written this replaces a double quote with
            # itself; presumably the first argument was an HTML entity
            # originally -- confirm against the upstream source.
            title = title.replace('"', '"')
            # NOTE(review): the pattern accepts ',' and '.' in the length, and
            # int() raises ValueError on them; newpagesoffset() strips them first.
            length = int(m.group('length'))
            loggedIn = u''
            username = m.group('username')
            comment = u''
            if title not in seen:
                # Begin -------------------------------------------- %Pier%
                PageFound = True
                seen.add(title)
                #page = Page(self, title)
                page = Page(site, title)
                # End -------------------------------------------- %Pier%
                yield page, date, length, loggedIn, username, comment
        if not repeat:
            break
        # Begin -------------------------------------------- %Pier%
        # month abbreviations as they appear in the matched date text
        listaMesi={'gen': '01', 'feb': '02', 'mar': '03', 'apr': '04', 'mag': '05', 'giu': '06', 'lug': '07', 'ago': '08', 'set': '09', 'ott': '10', 'nov': '11', 'dic': '12', }
        if days == None:
            days = 0
        # ``date`` is the last entry matched above; it is split into four
        # space-separated fields: time (comma removed), day, month, year.
        # NOTE(review): if no entry matched on the first fetch, ``date`` is unbound here.
        oramin, giorno, mese, anno = date.split(' ')
        mese = int(listaMesi[mese])
        ora, minuti = oramin.replace(",", "").split(':')
        #DateStart = datetime.datetime.today() - datetime.timedelta(days=int(days))
        DatePage = datetime.datetime(int(anno), mese, int(giorno), int(ora), int(minuti))
        #UTCOffset = datetime.datetime.today() - datetime.datetime.utcnow()
        DatePage -= UTCOffset
        # the next fetch starts from the date of the last entry just parsed
        OffsetString = DatePage.strftime("%Y%m%d%H%M%S")
        if DatePage < DateStart:
            break
        # allow exit when NewPages ended
        if days > 0 and not PageFound:
            break
        # End -------------------------------------------- %Pier%
#
# new class (derived from wikipedia.py Site.newpages)
#
def newpagesoffset(site = None, number = 10, get_redirect = False, repeat = False, namespace = 0, OffsetStart = None, OffsetEnd = None, days = None):
#def newpages(self, number = 10, get_redirect = False, repeat = False, namespace = 0):
    """Yield new articles (as Page objects) from Special:Newpages.
    Starts with the newest article and fetches the number of articles
    specified in the first argument. If repeat is True, it fetches
    Newpages again. If there is no new page, it blocks until there is
    one, sleeping between subsequent fetches of Newpages.
    The objects yielded are tuples composed of the Page object,
    timestamp (unicode), length (int), an empty unicode string, username
    or IP address (str), comment (unicode).

    Changes with respect to the description above, as visible in the code:
    on the API branch OffsetStart and OffsetEnd are sent as 'rcstart' and
    'rcend', OffsetStart is moved to the timestamp of the entries already
    read so that a repeated query continues from there, and the loop ends as
    soon as a query returns no page that was not already seen.
    The ``get_redirect`` and ``days`` arguments are not used in the body.
    """
    # TODO: in recent MW versions Special:Newpages takes a namespace parameter,
    # and defaults to 0 if not specified.
    # TODO: Detection of unregistered users is broken
    # TODO: Repeat mechanism doesn't make much sense as implemented;
    # should use both offset and limit parameters, and have an
    # option to fetch older rather than newer pages
    # page ids (API branch) or titles (HTML branch) already yielded
    seen = set()
    while True:
        if site.has_api() and site.versionnumber() >= 10:
            # API branch: list the page creations from recent changes
            params = {
                'action': 'query',
                'list': 'recentchanges',
                'rctype': 'new',
                # Begin -------------------------------------------- %Pier%
                'rcstart': OffsetStart,
                'rcend': OffsetEnd,
                # End -------------------------------------------- %Pier%
                'rcnamespace': namespace,
                'rclimit': int(number),
                'rcprop': ['ids','title','timestamp','sizes','user','comment'],
                'rcshow': ['!bot','!redirect'],
                #'': '',
            }
            data = query.GetData(params, site)['query']['recentchanges']
            # Begin -------------------------------------------- %Pier%
            PageFound = False
            # End -------------------------------------------- %Pier%
            for np in data:
                if np['pageid'] not in seen:
                    seen.add(np['pageid'])
                    page = Page(site, np['title'], defaultNamespace=np['ns'])
                    # Begin -------------------------------------------- %Pier%
                    PageFound = True
                    # End -------------------------------------------- %Pier%
                    yield page, np['timestamp'], np['newlen'], u'', np['user'], np['comment']
                # Begin -------------------------------------------- %Pier%
                # the next query starts from the timestamp of the last entry read
                # NOTE(review): the nesting level of this assignment was
                # reconstructed -- confirm against the upstream source.
                OffsetStart = np['timestamp']
            # nothing new in this batch: the listing is exhausted
            if not PageFound:
                break
            # End -------------------------------------------- %Pier%
        else:
            # HTML branch: parse the listing page with a regular expression
            path = site.newpages_address(n=number, namespace=namespace)
            # The throttling is important here, so always enabled.
            get_throttle()
            html = site.getUrl(path)
            entryR = re.compile('<li[^>]*>(?P<date>.+?) \S*?<a href=".+?"'
                                ' title="(?P<title>.+?)">.+?</a>.+?[\(\[](?P<length>[\d,.]+)[^\)\]]*[\)\]]'
                                ' .?<a href=".+?" title=".+?:(?P<username>.+?)">')
            for m in entryR.finditer(html):
                date = m.group('date')
                title = m.group('title')
                # NOTE(review): as written this replaces a double quote with
                # itself; presumably the first argument was an HTML entity
                # originally -- confirm against the upstream source.
                title = title.replace('"', '"')
                # drop the thousands separators before converting
                length = int(re.sub("[,.]", "", m.group('length')))
                loggedIn = u''
                username = m.group('username')
                comment = u''
                if title not in seen:
                    seen.add(title)
                    page = Page(site, title)
                    yield page, date, length, loggedIn, username, comment
        if not repeat:
            break