filtro.py (ver. 2.4.1)

#!/usr/bin/python
# -*- coding: utf-8  -*-

"""
Command line options:
-deb        Enable debug output
-nofile     Do not create local report files
-nopage     Do not save the wiki pages
-?          Print this help page

-days:      Number of days to examine
-start:     Specify start offset (in days, 0=today, 1=yesterday, 2=...)
-num:       Specify number of pages (if days==0, total number of pages; if days!=0, number of pages per block)
-file:      Specify file name for report
-size:      Minimum page size (for the long-pages filter)
-sizeshort: Maximum page size (for the short-pages filter)
-progetti   Select pages for all projects
    -archi      Select pages for 'Architettura'
    -volley     Select pages for 'Pallavolo'
    -ath        Select pages for 'Atletica'

"""

# ------------------------------
# Script: filtro.py
# Author: %Pier%
# Thanks to: Jalo
# version: 2.4.1  date: 28/06/2014
# ------------------------------
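
# Example invocation (values are illustrative):
#   python filtro.py -days:2 -start:1 -num:500 -progetti -file:report
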
VER = "filtro.py 2.4.1"

import wikipedia
import pagegenerators
import sys
import datetime
import time
import re

from wikipedia import Page
from family import Family
from wiki_lib import GetSize
from wiki_lib import getPageParameter
from wiki_lib import NewpagesPageGeneratorOffset
from wiki_lib import NewpagesPageGeneratorOffsetParam

# indexes into the getPageParameter() result tuple
CR_TIME = 1
AUTHOR  = 2
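
# Note (assumption): getVersionHistory() items are (revision id, timestamp,
# username, comment) tuples, hence creation time at index 1 and author at
# index 2 (see getPageParameter in wiki_lib.py below).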

# templates checked on all pages (a match excludes the page)
template_list = ( 'A', 'C', 'E', 'P', 'S', 'T', 'W', 'V',
                'Controlcopy', 'Disambigua', 'Disambig', 'Sigla2', 'Sigla3', 'WND', 'Bufala', 
                'Cancellazione', 'Da cancellare', 'Da correggere', 'Trasferimento', 
                'F', 'ViolazioneCopyright', 'Stub', 'ListaBio', 'Tabellone tennis' )

# templates checked only on short pages
template_list_short = ( 'O', 'WIP', 'Tmp', 'Protetta', 'Avvisobloccominaccia', 'Avvisoblocco scad' )
# templates checked only on long pages
template_list_long = ( )

# authors (usually bots) that are not checked
author_white_list = ( 'Biobot', 'Gacbot', 'BimBot', 'Kaspobot', 'NuclearBot', 'HubbleBot' )

# keywords for project selection
reArchi = re.compile("architett", re.I)
reNotArchi = re.compile("(microcontroll|processor|videogioc|informatic|software|\{\{disambigua\}\}|\{\{cancellazione|\{\{Territori|\{\{Film|Crittografia|Categoria:Motori)", re.I)

reVolley = re.compile("pallavol", re.I)
reNotVolley = re.compile("\{\{disambigua\}\}", re.I)

reAthletics = re.compile("atletica", re.I) 
reNotAthletics = re.compile("\{\{disambigua\}\}", re.I)
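
# Illustrative behaviour of the keyword filters: a page is a candidate for a
# project when its "positive" regex matches and its "negative" one does not,
# e.g. for 'Architettura':
#   reArchi.search(u"un esempio di architettura gotica")   # match -> candidate
#   reNotArchi.search(u"{{disambigua}}")                   # match -> excluded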

# global data
resultShort = ""
resultLong = ""
resultShortArchi = ""
resultLongArchi = ""
resultVolley = ""
resultAthletics = ""
count_short = 0
count_long = 0
count_short_archi = 0
count_long_archi = 0
count_volley = 0
count_athletics = 0 
count_redirect = 0

################################################################################
#   returns a text line with the wiki page's data
#
def PageData(page, timestamp = None, username = None, size = 0): 
    Text = ('# %s ' % timestamp.encode( "utf-8" ) )
    Text += ('%s' % (page) )
    Text += (' - [%d byte]' % (size) )
    Text += (' - [[Utente:%s|]] ' % username.encode( "utf-8" ) )
    if len(page.imagelinks()) == 0:
        Text += (' - <span style="color:#FF0000">no immagini</span>')
    else:
        Text += (' - <span style="color:#008000">immagini</span>')
    if len(page.interwiki()) == 0:
        Text += (' - <span style="color:#FF0000">no interwiki</span>')
    else:
        Text += (' - <span style="color:#008000">interwiki</span>')
    if len(page.categories()) == 0:
        Text += (' - <span style="color:#FF0000">no categorie</span>\n')
    else:
        Text += (' - <span style="color:#008000">categorie</span>\n')
    return Text
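
# An illustrative report line built by PageData() (title and values are made
# up; the exact page rendering depends on the framework):
# # 2014-06-28T10:15:22Z [[Esempio]] - [1234 byte] - [[Utente:Mario|]]  - <span style="color:#008000">immagini</span> - ...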
    
################################################################################
#   page filter for the 'Architettura' project
#
def filtro_archi(page, text, timestamp = None, username = None, size = 0):
    global resultShortArchi, resultLongArchi
    global count_short_archi, count_long_archi

    SelectArchi = False
    if (reArchi.search(text) and reNotArchi.search(text) == None):
        SelectArchi = True
        
    if SelectArchi:
        if test:
            print ("Page (archi): %s" % page )
        SelectedLongArchi  = False
        SelectedShortArchi = False
        if size > size_short:
            SelectedLongArchi  = True
        else:
            SelectedShortArchi = True

        if (SelectedLongArchi or SelectedShortArchi):
            finalText = PageData(page, timestamp, username, size)

        if SelectedLongArchi:
            count_long_archi += 1
            if savefile:
                fal.write(finalText)
            if savepage:
                resultLongArchi += finalText

        if SelectedShortArchi:
            count_short_archi += 1
            if savefile:
                fas.write(finalText)
            if savepage:
                resultShortArchi += finalText

################################################################################
#   page filter for the 'Pallavolo' project
#
def filtro_volley(page, text, timestamp = None, username = None, size = 0):
    global resultVolley
    global count_volley

    SelectVolley = False
    if (reVolley.search(text) and reNotVolley.search(text) == None):
        SelectVolley = True
        
    if SelectVolley:
        if test:
            print ("Page (volley): %s" % page )

        finalText = PageData(page, timestamp, username, size)

        count_volley += 1
        if savefile:
            fv.write(finalText)
        if savepage:
            resultVolley += finalText

################################################################################
#   page filter for the 'Atletica' project
#
def filtro_athletics(page, text, timestamp = None, username = None, size = 0):
    global resultAthletics
    global count_athletics

    SelectAthletics = False
    if (reAthletics.search(text) and reNotAthletics.search(text) == None):
        SelectAthletics = True
        
    if SelectAthletics:
        if test:
            print ("Page (athletics): %s" % page )

        finalText = PageData(page, timestamp, username, size)

        count_athletics += 1
        if savefile:
            fat.write(finalText)
        if savepage:
            resultAthletics += finalText


################################################################################
#   main page-filtering function
#
def workon(page, timestamp = None, username = None):
    global resultLong, resultShort
    global count_short, count_long, count_redirect
    #global count

    if test:
        time_start_f = time.time()

    if page.isRedirectPage():
        count_redirect += 1
        if savefile:
            fr.write('# %s \n' % (page) )
        # redirects are only logged in the redirect report, not filtered further
        return
            
    try:
        text = page.get(get_redirect=True)
        if test:
            print ("----------------------------------------------------")
            print ("Page (%d) %s" % (count, page) )
            #print (" - begin text ----------------")
            #print (" Text is: %s" % text.encode( "utf-8" ) )
            #print (" - end text ------------------")

        size = page.GetSize()

        if select_archi:
            filtro_archi(page, text, timestamp, username, size)
        if select_volley:
            filtro_volley(page, text, timestamp, username, size)
        if select_athletics:
            filtro_athletics(page, text, timestamp, username, size)
        
        SelectedLong  = True
        SelectedShort = True
        if size > size_short:
            SelectedShort = False
        if size < size_long:
            SelectedLong = False
        if not (SelectedLong or SelectedShort):
            if test:
                print (" Page not selected for size (%d)" % size)
            return

        templates = page.templates()
        if test:
            print (" Template in page: %s" % templates)
        for t in templates:
            if t in template_list:
                if test:
                    print (" Page not selected for template: (%s)" % t)
                return
        if SelectedLong:
            for t in templates:
                if t in template_list_long:
                    if test:
                        print (" Page not selected for template: (%s)" % t)
                    return
        if SelectedShort:
            for t in templates:
                if t in template_list_short:
                    if test:
                        print (" Page not selected for template: (%s)" % t)
                    return

        if (SelectedLong or SelectedShort):
            if (timestamp == None or username == None): 
                pageparam = page.getPageParameter()
                if pageparam == None:
                    print(" pageparam = None, skip page %s" % page)
                    if savefile:
                        fe.write("Skip page: %s\n" % page)
                    return
                timestamp = pageparam[CR_TIME]
                username  = pageparam[AUTHOR]
            
            if username in author_white_list:
                if test:
                    print(" Author %s in whitelist" % username)
                return

            finalText = PageData(page, timestamp, username, size)

        if SelectedLong:
            count_long += 1
            if savefile:
                fl.write(finalText)
            if savepage:
                resultLong += finalText;
            if test:
                print(" --> Long %s" % page)

        if SelectedShort:
            count_short += 1
            if savefile:
                fs.write(finalText)
            if savepage:
                resultShort += finalText;
            if test:
                print(" --> Short %s" % page)
    
    except wikipedia.IsRedirectPage:
        if test:
            print("Double Redirect ? %s" % page)
        if savefile:
            fe.write("Double Redirect ? %s\n" % page)
        return

    except wikipedia.NoPage:
        print("Page doesn't exist! %s" % page)
        if savefile:
            fe.write("Page doesn't exist! %s\n" % page)
        return

    except wikipedia.SectionError:
        print("Redirect: SectionError! %s" % page)
        if savefile:
            fe.write("Redirect: SectionError!  %s\n" % page)
        return
    
    except wikipedia.InvalidTitle:
        print("InvalidTitle! %s" % page)
        if savefile:
            fe.write("InvalidTitle!  %s\n" % page)
        return


###################################################################################################
#
#   m a i n 
#
###################################################################################################
try:
    # defining all default values
    filename = "report"
    filenameArchi = "report-archi"
    filenameVolley = "report-volley"
    filenameAthletics = "report-athletics"
    botcomment = VER
    test = False
    count = 0
    days = 1
    start_offset = 0
    num_pages = 100
    size_long = 4000
    size_short = 1000
    savefile = True
    savepage = True
    OffsetStringStart = None
    OffsetStringEnd = None
    # default start time: now in UTC, overridden by -start:
    # (without this, DateStart would be undefined at the DateEnd computation)
    DateStart = datetime.datetime.utcnow()
    select_archi = False
    select_volley = False
    select_athletics = False

    # handling command line parameters
    for arg in wikipedia.handleArgs():
        if arg.startswith("-deb"):
            test = True
        elif arg.startswith("-nofile"):
            savefile = False
        elif arg.startswith("-nopage"):
            savepage = False
        elif arg.startswith("-start:"):
            start_offset = int(arg[7:])
            if start_offset < 1:
                start_offset = 1
            if start_offset > 30:
                start_offset = 30
            DateStart = datetime.datetime.today() - datetime.timedelta(days=int(start_offset))
            DateStart = DateStart.replace(hour=23, minute=59, second=59)
            UTCOffset = datetime.datetime.today() - datetime.datetime.utcnow()
            DateStart -= UTCOffset
            OffsetStringStart = DateStart.strftime("%Y%m%d%H%M%S")
            if test:
                print("Start day is %s" % OffsetStringStart)
        elif arg.startswith("-num:"):
            num_pages = int(arg[5:])
            if num_pages < 1:
                num_pages = 100
            if num_pages > 5000:
                num_pages = 5000
        elif arg.startswith("-size:"):
            size_long = int(arg[6:])
            if size_long < 10:
                size_long = 10
        elif arg.startswith("-sizeshort:"):
            size_short = int(arg[11:])
            if size_short < 10:
                size_short = 10
        elif arg.startswith("-days:"):
            days = int(arg[6:])
            if days > 30:
                days = 30
        elif arg.startswith("-file:"):
            filename = arg[6:]
        elif arg.startswith("-archi"):
            select_archi = True
        elif arg.startswith("-volley"):
            select_volley = True
        elif arg.startswith("-ath"):
            select_athletics = True
        elif arg.startswith("-progetti"):
            select_archi = True
            select_volley = True
            select_athletics = True
        elif arg.startswith("-?"):
            wikipedia.showHelp('filtro')
            wikipedia.stopme()
            sys.exit()

    # Attach at runtime
    Page.GetSize = GetSize
    Page.getPageParameter = getPageParameter
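    # After this monkey-patching every Page instance exposes page.GetSize()
    # and page.getPageParameter(), as used in workon() above.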
    
    wikipedia.output(u'------ Start process ------')
    time_start = time.time()
    stat_string =  ('Report generato il %s <br/>\n' % datetime.date.today() )
    stat_string += ('Parametri dello script:<br/>\n')    
    stat_string += ('Offset di inizio (giorni): %d  -  Giorni esaminati: %d<br/>\n\n' % ((start_offset, days) ))    
    print(stat_string)
    resultShort = resultLong = stat_string

    resultLongArchi = resultShortArchi = ('Report generato il %s <br/>\n' % datetime.date.today() )
    resultVolley = resultAthletics = ('Report generato il %s <br/>\n' % datetime.date.today() )
    
    DateEnd = DateStart - datetime.timedelta(days=int(days))
    OffsetStringEnd = DateEnd.strftime("%Y%m%d%H%M%S")
    if test:
        print("End day is %s" % OffsetStringEnd)

    if savefile:
        fl = open(filename+'-long.txt','w+')
        fs = open(filename+'-short.txt','w+')
        fr = open(filename+'-redir.txt','w+')
        fe = open(filename+'-error-log.txt','w+')
        if select_archi:
            fal = open(filenameArchi+'-long.txt','w+')
            fas = open(filenameArchi+'-short.txt','w+')
        if select_volley:
            fv = open(filenameVolley+'.txt','w+')
        if select_athletics:
            fat = open(filenameAthletics+'.txt','w+')

    mysite = wikipedia.getSite()
    # Two generators are used because, with [for page in generator:], I could not
    # get all the per-page parameters initialized besides page itself; the problem
    # is PreloadingGenerator, which yields only the page item even though newpages
    # (and its derivatives) return a tuple of several items.
    basicgenerator  = NewpagesPageGeneratorOffset(number=num_pages, OffsetStart=OffsetStringStart, OffsetEnd=OffsetStringEnd, repeat=True, site=mysite, days=days)
    basicgenerator2 = NewpagesPageGeneratorOffsetParam(number=num_pages, OffsetStart=OffsetStringStart, OffsetEnd=OffsetStringEnd, repeat=True, site=mysite, days=days)
    # try without PreloadingGenerator? measure whether it is much slower
    generator = pagegenerators.PreloadingGenerator(basicgenerator, pageNumber=num_pages)
    for page in generator:
        count += 1
        #print("Page (%d) %s" % (count, page) )
        page2, timestamp, length, loggedIn, username, comment = basicgenerator2.next()
        username = username.replace(" (pagina inesistente)", "")
        username = username.replace("Contributi/", "")
        if (page != page2):
            print("Page mismatch: %s" % page)
            timestamp = length = loggedIn = username = comment = None
        #if test:
        #    print("Page (%d) %s" % (count, page) )
        workon(page, timestamp, username)
        
    if savefile:
        fl.close()
        fs.close()
        fr.close()
        fe.close()
        if select_archi:
            fal.close()
            fas.close()
        if select_volley:
            fv.close()
        if select_athletics:
            fat.close()

    if count == 0:
        wikipedia.output(u'------ Error: no pages to check ------')
        wikipedia.stopme()
        sys.exit()

    if savepage:
        wikipedia.output(u'------ Save pages ------')
        PageShort = wikipedia.Page(mysite, "Utente:.pier.bot./pagine corte")
        PageLong = wikipedia.Page(mysite, "Utente:.pier.bot./pagine lunghe")
        PageLog = wikipedia.Page(mysite, "Utente:.pier.bot./log")
        PageShort.put(resultShort.decode( "utf-8" ), comment=botcomment)
        PageLong.put(resultLong.decode( "utf-8" ), comment=botcomment)
        if select_archi:
            PageLongArchi = wikipedia.Page(mysite, "Progetto:Architettura/Nuove (automatiche)")
            PageShortArchi = wikipedia.Page(mysite, "Progetto:Architettura/Stub (automatiche)")
            PageLongArchi.put(resultLongArchi.decode( "utf-8" ), comment=botcomment)
            PageShortArchi.put(resultShortArchi.decode( "utf-8" ), comment=botcomment)
        if select_volley:
            PageVolley = wikipedia.Page(mysite, "Progetto:Sport/Pallavolo/Nuove (automatiche)")
            PageVolley.put(resultVolley.decode( "utf-8" ), comment=botcomment)
        if select_athletics:
            PageAthletics = wikipedia.Page(mysite, "Progetto:Sport/Atletica leggera/Nuove (automatiche)")
            PageAthletics.put(resultAthletics.decode( "utf-8" ), comment=botcomment)
        
    wikipedia.output(u'--- Statistics ------------')
    time_end = time.time()
    time_el = time_end - time_start
    stat_string += ('Pagine esaminate         : %d<br/>\n' % count)
    stat_string += ('Pagine corte selezionate : %d (%d%%)<br/>\n' % (count_short, 100*count_short/count) )
    stat_string += ('Pagine lunghe selezionate: %d (%d%%)<br/>\n' % (count_long, 100*count_long/count) )
    stat_string += ('Redirect                 : %d<br/>\n\n' % count_redirect)
    if select_archi:
        stat_string += ('- Architettura  -<br/>\n')
        stat_string += ('Abbozzi   : %d (%d%%)<br/>\n'   % (count_short_archi, 100*count_short_archi/count) )
        stat_string += ('Voci      : %d (%d%%)<br/>\n\n' % (count_long_archi, 100*count_long_archi/count) )
    if select_volley:
        stat_string += ('- Pallavolo  -<br/>\n')
        stat_string += ('Voci      : %d (%d%%)<br/>\n\n' % (count_volley, 100*count_volley/count) )
    if select_athletics:
        stat_string += ('- Atletica   -<br/>\n')
        stat_string += ('Voci      : %d (%d%%)<br/>\n\n' % (count_athletics, 100*count_athletics/count) )

    stat_string += ('Tempo impiegato: %3d:%02d <br/>\n' % (time_el / 60, time_el % 60) )
    
    print(stat_string)

    if savepage:
        PageLog.put(stat_string.decode( "utf-8" ), comment=botcomment)

    wikipedia.output(u'------ End process ------')

finally:
    wikipedia.stopme()

wiki_lib.py (ver. 2.00)

# -*- coding: utf-8  -*-

__version__='$Id: wiki_lib.py,v 2.00 2011/02/19 22:44:00 %Pier% $'

"""
Library to extend pywikipedia framework
"""

import re
import datetime
import query
import wikipedia

from wikipedia import Page
from wikipedia import Site
from wikipedia import removeLanguageLinks
from wikipedia import removeCategoryLinks
from wikipedia import get_throttle

from family import Family


#
# methods that will be attached to Page at runtime
#

def GetSize(self):
    """
    Returns the actual size of the page (excluding interwiki and category links)
    """
    txt = self.get()
    txt = removeLanguageLinks(txt)
    txt = removeCategoryLinks(txt, site = self.site())
    return len(txt)
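
# Usage sketch (hypothetical title; GetSize must first be attached to Page at
# runtime, as filtro.py does with Page.GetSize = GetSize):
#   size = Page(wikipedia.getSite(), u"Pagina di prova").GetSize()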

def getPageParameter(self):
    """
    Returns the first-revision data of the page: (revision id, timestamp, author, comment); the author may be an anonymous IP
    """
    edits = self.getVersionHistory(reverseOrder=True, revCount=1)
    #print(edits)
    
    try:
        return edits[0]
    except IndexError:
        rev = self.latestRevision()
        user = self.userName()
        edtime = self.editTime()
        return rev, edtime, user, None
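
# The tuple returned above is laid out as (revision id, timestamp, username,
# comment); filtro.py reads it with CR_TIME = 1 and AUTHOR = 2.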

def getAuthor(self):
    """
    Returns the page author (including anonymous IP)
    """
    edits = self.getVersionHistory(reverseOrder = True)
#    edits = self.getVersionHistory()

#    return edits[len(edits)-1][2]
    return edits[0][2]

def getCreationTime(self):
    """
    Returns the page creation timestamp (including anonymous IP)
    """
    try:
        edits = self.getVersionHistory(reverseOrder = True)
    except Exception:
        # 'edits' is undefined when getVersionHistory() fails, so report only the page
        print("Error in getCreationTime(): page %s" % self)
        return None

#    return edits[len(edits)-1][1]
    return edits[0][1]

#
# method that will be attached to Site (in wikipedia.py)
#
def newpages_address_off(self, n=50, OffsetString=None):
    return self.family.newpages_address_offset(self.lang, n, OffsetString)

#
# method that will be attached to Family (in family.py)
#
def newpages_address_offset(self, code, limit=50, OffsetString=None):
    return "%s?title=%s:Newpages&limit=%d&offset=%s" % (self.path(code), self.special_namespace_url(code), limit, OffsetString)
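
# Illustrative URL produced for it.wiki (path and special-namespace name come
# from the Family configuration and may differ):
#   /w/index.php?title=Speciale:Newpages&limit=50&offset=20140628235959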

#
# new generator functions (derived from pagegenerators.py NewpagesPageGenerator)
#

def NewpagesPageGeneratorOffset(number = 100, OffsetStart = None, OffsetEnd = None, repeat = False, site = None, days = None):
    if site is None:
        site = wikipedia.getSite()
    for page in newpagesoffset(site=site, number=number, OffsetStart=OffsetStart, OffsetEnd=OffsetEnd, repeat=repeat, days = days):
        yield page[0]

def NewpagesPageGeneratorOffsetParam(number = 100, OffsetStart = None, OffsetEnd = None, repeat = False, site = None, days = None):
    if site is None:
        site = wikipedia.getSite()
    for (page, date, length, loggedIn, username, comment) in newpagesoffset(site=site, number=number, OffsetStart=OffsetStart, OffsetEnd=OffsetEnd, repeat=repeat, days = days):
        yield page, date, length, loggedIn, username, comment
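
# Usage sketch (mirrors the main loop of filtro.py; parameter values are
# illustrative):
#   gen = NewpagesPageGeneratorOffsetParam(number=100, site=wikipedia.getSite())
#   for page, date, length, loggedIn, username, comment in gen:
#       print("%s %s [%d byte]" % (date, page, length))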
        
#
# new generator function (derived from wikipedia.py Site.newpages)
#
def newpagesoffset_old(site = None, number = 10, repeat = False, OffsetString = None, days = None):
#def newpages(self, number = 10, repeat = False):
    """Generator which yields new articles subsequently.
       It starts with the article created 'number' articles
       ago (first argument). When these are all yielded
       and repeat is True,
       it fetches NewPages again. If there is no new page,
       it blocks until there is one, sleeping between subsequent
       fetches of NewPages.

       The objects yielded are tuples composed of the Page object,
       date (string), length (int), an empty loggedIn string,
       username (string) and comment (string).

    """
    # Begin --------------------------------------------  %Pier%
    # Attach at runtime
    Site.newpages_address_off = newpages_address_off
    Family.newpages_address_offset = newpages_address_offset

    UTCOffset = datetime.datetime.today() - datetime.datetime.utcnow()
    # End   --------------------------------------------  %Pier%

    # The throttling is important here, so always enabled.
    if repeat:
        throttle = True
    seen = set()
    # Begin --------------------------------------------  %Pier%
    if OffsetString == None:
        DateStart = datetime.datetime.today() - datetime.timedelta(days=int(days))
        DateStart -= UTCOffset
    else:
        DateStart = datetime.datetime.strptime(OffsetString, "%Y%m%d%H%M%S") - datetime.timedelta(days=int(days))
        #DateStart = DateStart.replace(hour=23, minute=59, second=59)
        #DateStart -= UTCOffset
    #DateStart -= UTCOffset
    # End   --------------------------------------------  %Pier%

    while True:
        # Begin --------------------------------------------  %Pier%
        PageFound = False
        if OffsetString == None:
            path = site.newpages_address(n=number)
        else:
            path = site.newpages_address_off(n=number, OffsetString=OffsetString)
        # End   --------------------------------------------  %Pier%
        get_throttle()
        # Begin --------------------------------------------  %Pier%
        #html = self.getUrl(path)
        html = site.getUrl(path)
        # End   --------------------------------------------  %Pier%
        
#        entryR = re.compile('<li[^>]*>(?P<mw-newpages-time>.+?) \S*?<a href=".+?" title="(?P<title>.+?)">.+?</a>.+?[\(\[](?P<mw-newpages-length>\d+)[^\)\]]*[\)\]] .?<a href=".+?" title=".+?:(?P<username>.+?)"')
        entryR = re.compile('<li[^>]*>(?P<date>.+?) \S*?<a href=".+?"'
            ' title="(?P<title>.+?)">.+?</a>.+?[\(\[](?P<length>[\d,.]+)[^\)\]]*[\)\]]'
            ' .?<a href=".+?" title=".+?:(?P<username>.+?)">')
        for m in entryR.finditer(html):
            date = m.group('date')
            #print(date)
            title = m.group('title')
            title = title.replace('&quot;', '"')
            length = int(re.sub("[,.]", "", m.group('length')))
            loggedIn = u''
            username = m.group('username')
            comment = u''

            if title not in seen:
                # Begin --------------------------------------------  %Pier%
                PageFound = True
                seen.add(title)
                #page = Page(self, title)
                page = Page(site, title)
                # End   --------------------------------------------  %Pier%
                yield page, date, length, loggedIn, username, comment

        if not repeat:
            break

        # Begin --------------------------------------------  %Pier%
        listaMesi={'gen': '01', 'feb': '02', 'mar': '03', 'apr': '04', 'mag': '05', 'giu': '06', 'lug': '07', 'ago': '08', 'set': '09', 'ott': '10', 'nov': '11', 'dic': '12', }
        if days == None:
            days = 0
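
        # 'date' still holds the last entry parsed from the HTML above; on
        # it.wiki it is assumed to look like "09:15, 28 giu 2014"
        # (time, day, abbreviated Italian month, year).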

        oramin, giorno, mese, anno = date.split(' ')
        mese = int(listaMesi[mese])
        ora, minuti = oramin.replace(",", "").split(':')

        #DateStart = datetime.datetime.today() - datetime.timedelta(days=int(days))

        DatePage = datetime.datetime(int(anno), mese, int(giorno), int(ora), int(minuti))

        #UTCOffset = datetime.datetime.today() - datetime.datetime.utcnow()
        DatePage -= UTCOffset
        OffsetString = DatePage.strftime("%Y%m%d%H%M%S")

        if DatePage < DateStart:
            break

        # allow exit when NewPages ended
        if days > 0 and not PageFound:
            break
        # End   --------------------------------------------  %Pier%


#
# new generator function (derived from wikipedia.py Site.newpages)
#
def newpagesoffset(site = None, number = 10, get_redirect = False, repeat = False, namespace = 0, OffsetStart = None, OffsetEnd = None, days = None):
#def newpages(self, number = 10, get_redirect = False, repeat = False, namespace = 0):
    """Yield new articles (as Page objects) from Special:Newpages.

    Starts with the newest article and fetches the number of articles
    specified in the first argument. If repeat is True, it fetches
    Newpages again. If there is no new page, it blocks until there is
    one, sleeping between subsequent fetches of Newpages.

    The objects yielded are tuples composed of the Page object,
    timestamp (unicode), length (int), an empty unicode string, username
    or IP address (str), comment (unicode).

    """
    # TODO: in recent MW versions Special:Newpages takes a namespace parameter,
    #       and defaults to 0 if not specified.
    # TODO: Detection of unregistered users is broken
    # TODO: Repeat mechanism doesn't make much sense as implemented;
    #       should use both offset and limit parameters, and have an
    #       option to fetch older rather than newer pages

    seen = set()
    while True:
        if site.has_api() and site.versionnumber() >= 10:
            params = {
                'action': 'query',
                'list': 'recentchanges',
                'rctype': 'new',
            # Begin --------------------------------------------  %Pier%
                'rcstart': OffsetStart,
                'rcend': OffsetEnd,
            # End   --------------------------------------------  %Pier%
                'rcnamespace': namespace,
                'rclimit': int(number),
                'rcprop': ['ids','title','timestamp','sizes','user','comment'],
                'rcshow': ['!bot','!redirect'],
                #'': '',
            }
            data = query.GetData(params, site)['query']['recentchanges']
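
            # Each record in 'data' is a dict carrying at least the keys read
            # below: 'pageid', 'ns', 'title', 'timestamp', 'newlen', 'user'
            # and 'comment' (per the rcprop list requested above).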
            
            # Begin --------------------------------------------  %Pier%
            PageFound = False
            # End   --------------------------------------------  %Pier%
            for np in data:
                if np['pageid'] not in seen:
                    seen.add(np['pageid'])
                    page = Page(site, np['title'], defaultNamespace=np['ns'])
                    # Begin --------------------------------------------  %Pier%
                    PageFound = True
                    # End   --------------------------------------------  %Pier%
                    yield page, np['timestamp'], np['newlen'], u'', np['user'], np['comment']
            # Begin --------------------------------------------  %Pier%
            # break before touching np: if nothing was yielded, np may be undefined
            if not PageFound:
                break
            OffsetStart = np['timestamp']
            # End   --------------------------------------------  %Pier%
        else:
            path = site.newpages_address(n=number, namespace=namespace)
            # The throttling is important here, so always enabled.
            get_throttle()
            html = site.getUrl(path)

            entryR = re.compile('<li[^>]*>(?P<date>.+?) \S*?<a href=".+?"'
                ' title="(?P<title>.+?)">.+?</a>.+?[\(\[](?P<length>[\d,.]+)[^\)\]]*[\)\]]'
                ' .?<a href=".+?" title=".+?:(?P<username>.+?)">')
            for m in entryR.finditer(html):
                date = m.group('date')
                title = m.group('title')
                title = title.replace('&quot;', '"')
                length = int(re.sub("[,.]", "", m.group('length')))
                loggedIn = u''
                username = m.group('username')
                comment = u''

                if title not in seen:
                    seen.add(title)
                    page = Page(site, title)
                    yield page, date, length, loggedIn, username, comment
        if not repeat:
            break