Utente:GiacoBot/Replace2.py

Replace2.py from the pywikipedia package, modified at XXL1986's suggestion to fix all the wrongly accented forms (perchè, poichè, checchè, affinchè, etc.). It could probably also be slimmed down by removing a number of fixes written specifically for de.wiki and elsewhere...
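
For reference, a minimal standalone sketch of what the accent fix does, using the same pattern as the 'tutti-che' fix in the script below (the variable name and the sample string are illustrative only):

# -*- coding: utf-8 -*-
import re

# The same pattern used by the 'tutti-che' fix below.
accenti = re.compile(u"(?m)([Cc]hec|[Pp]er|[Aa]ffin|[Aa]lcun|[Gg]iac|[Ss]ic|[Ff]in|[Pp]oi|[Bb]en|[Aa]ccioc|[Ff]uor|[Mm]ac)ch(è|e['])([^'.])", re.UNICODE)
print accenti.sub(ur"\1ché\3", u"perchè il testo dice poichè e affinchè ")
# prints: perché il testo dice poiché e affinché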

# -*- coding: utf-8  -*-
#
# (C) Daniel Herding, 2004
# Distributed under the terms of the MIT license.
#
# 
# 1. To add a new fix, copy the lines below (from START FIX to END FIX) and
#    fill them in.
#START FIX (delete this line)
# 'TITLE OF THE FIX': {
#        'regex': True,
#        'msg': {
#               'it':u'Bot: WHAT THE FIX DOES - THIS TEXT GOES INTO THE EDIT SUMMARY',
#              },
#        'replacements': [
# (u"(?m)SOURCE REGEX (TO BE REPLACED)", ur"TARGET REGEX (THE REPLACEMENT)"),
#        ]
#                },
#END FIX (delete this line)
#
# 2. Edit the fix and paste it right before the following code:
#     }
#     class XmlDumpReplacePageGenerator:
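#
# For instance, a hypothetical fix filled in from the template above (the
# name, the message and the pattern are illustrative only, not part of the
# original script):
# 'esempio': {
#        'regex': True,
#        'msg': {
#               'it':u'Bot: sostituzione E\' a inizio riga con È',
#              },
#        'replacements': [
# (u"(?m)^E' ", ur"È "),
#        ]
#                },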
#
__version__='$Id: replace.py,v 1.102 2006/03/12 16:35:54 wikipedian Exp $'

from __future__ import generators
import sys, re
import wikipedia, pagegenerators, catlib, config

# Summary messages in different languages
# NOTE: Predefined replacement tasks might use their own dictionary, see 'fixes'
# below.
msg = {
       'de':u'Bot: Automatisierte Textersetzung %s',
       'en':u'Robot: Automated text replacement %s',
       'es':u'Robot: Reemplazo automático de texto %s',
       'fr':u'Bot : Remplacement de texte automatisé %s',
       'hu':u'Robot: Automatikus szövegcsere %s',
       'ia':u'Robot: Reimplaciamento automatic de texto %s',
       'is':u'Vélmenni: breyti texta %s',
       'it':u'Bot: Sostituzione automatica di testo %s',
       'ka':u'რობოტი: ტექსტის ავტომატური შეცვლა %s',
       'lt':u'Botas: Automatinis teksto keitimas %s',       
       'pt':u'Bot: Mudança automática %s',
       'sr':u'Бот: Аутоматска замена текста %s',
       }

# Predefined replacement tasks.
fixes = {
    # These replacements will convert HTML to wiki syntax where possible, and
    # make remaining tags XHTML compliant.
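    # For example (illustrative input): "<b>testo</b>" becomes "'''testo'''",
    # and a line containing only "<h2> Titolo </h2>" becomes "== Titolo ==".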
    'HTML': {
        'regex': True,
        # We don't want to mess up pages which discuss HTML tags, so we skip
        # all pages which contain nowiki tags.
        'exceptions':  ['<nowiki>'],
        'msg': {
               'en':u'Robot: converting/fixing HTML',
               'de':u'Bot: konvertiere/korrigiere HTML',
               'ia':u'Robot: conversion/reparation de HTML',
               'it':u'Bot: conversione/correzione HTML',
               'pt':u'Bot: Corrigindo HTML',
               'sr':u'Бот: Поправка HTML-а'
              },
        'replacements': [
            # Everything case-insensitive (?i)
            # Keep in mind that MediaWiki automatically converts <br> to <br />
            # when rendering pages, so you might comment the next two lines out
            # to save some time/edits.
            #r'(?i)<br>':                      r'<br />',
            # linebreak with attributes
            #r'(?i)<br ([^>/]+?)>':            r'<br \1 />',
            (r'(?i)<b>(.*?)</b>',              r"'''\1'''"),
            (r'(?i)<strong>(.*?)</strong>',    r"'''\1'''"),
            (r'(?i)<i>(.*?)</i>',              r"''\1''"),
            (r'(?i)<em>(.*?)</em>',            r"''\1''"),
            # horizontal line without attributes in a single line
            (r'(?i)([\r\n])<hr[ /]*>([\r\n])', r'\1----\2'),
            # horizontal line without attributes with more text in the same line
            (r'(?i) +<hr[ /]*> +',             r'\r\n----\r\n'),
            # horizontal line with attributes; can't be done with wiki syntax
            # so we only make it XHTML compliant
            (r'(?i)<hr ([^>/]+?)>',            r'<hr \1 />'),
            # a header where only spaces are in the same line
            (r'(?i)([\r\n]) *<h1> *([^<]+?) *</h1> *([\r\n])',  r"\1= \2 =\3"),
            (r'(?i)([\r\n]) *<h2> *([^<]+?) *</h2> *([\r\n])',  r"\1== \2 ==\3"),
            (r'(?i)([\r\n]) *<h3> *([^<]+?) *</h3> *([\r\n])',  r"\1=== \2 ===\3"),
            (r'(?i)([\r\n]) *<h4> *([^<]+?) *</h4> *([\r\n])',  r"\1==== \2 ====\3"),
            (r'(?i)([\r\n]) *<h5> *([^<]+?) *</h5> *([\r\n])',  r"\1===== \2 =====\3"),
            (r'(?i)([\r\n]) *<h6> *([^<]+?) *</h6> *([\r\n])',  r"\1====== \2 ======\3"),
            # TODO: maybe we can make the bot replace <p> tags with \r\n's.
        ]
    },
    'case-de': { # German upper / lower case issues
        'regex': True,
        'msg': {
               'de':u'Bot: Korrigiere Groß-/Kleinschreibung',
              },
        'replacements': [
            (r'\batlantische(r|n|) Ozean', r'Atlantische\1 Ozean'),
            (r'\bdeutsche(r|n|) Bundestag\b', r'Deutsche\1 Bundestag'),
            (r'\bdeutschen Bundestags\b', r'Deutschen Bundestags'), # Aufpassen, z. B. 'deutsche Bundestagswahl'
            (r'\bdeutsche(r|n|) Reich\b', r'Deutsche\1 Reich'),
            (r'\bdeutschen Reichs\b', r'Deutschen Reichs'), # Aufpassen, z. B. 'deutsche Reichsgrenzen'
            (r'\bdritte(n|) Welt', r'Dritte\1 Welt'),
            (r'\bdreißigjährige(r|n|) Krieg', r'Dreißigjährige\1 Krieg'),
            (r'\beuropäische(n|) Gemeinschaft', r'Europäische\1 Gemeinschaft'),
            (r'\beuropäische(n|) Kommission', r'Europäische\1 Kommission'),
            (r'\beuropäische(n|) Parlament', r'Europäische\1 Parlament'),
            (r'\beuropäische(n|) Union', r'Europäische\1 Union'),
            (r'\berste(r|n|) Weltkrieg', r'Erste\1 Weltkrieg'),
            (r'\bkalte(r|n|) Krieg', r'Kalte\1 Krieg'),
            (r'\bpazifische(r|n|) Ozean', r'Pazifische\1 Ozean'),
            (r'Tag der deutschen Einheit', r'Tag der Deutschen Einheit'),
            (r'\bzweite(r|n|) Weltkrieg', r'Zweite\1 Weltkrieg'),
        ],
        'exceptions':  ['sic!'],
    },
    # Fix that corrects the accent in words like perchè, poichè, affinchè, etc.
    'tutti-che': {
        'regex': True,
        'msg': {
               'it':u'Bot: inversione accenti'
              },
        'replacements': [
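            # Matches one of the known prefixes (chec-, per-, affin-, alcun-,
            # giac-, sic-, fin-, poi-, ben-, accioc-, fuor-, mac-) followed by
            # "chè" or "che'", and rewrites the ending as "ché" (checchè ->
            # checché, perchè -> perché, and so on). The trailing [^'.] guard
            # presumably keeps the bot away from cases such as "che''" (wiki
            # italics) and from occurrences directly followed by a period.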
(u"(?m)([Cc]hec|[Pp]er|[Aa]ffin|[Aa]lcun|[Gg]iac|[Ss]ic|[Ff]in|[Pp]oi|[Bb]en|[Aa]ccioc|[Ff]uor|[Mm]ac)ch(è|e['])([^'.])", ur"\1ché\3"),
       ]
               },
    }
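
# Example invocation (a sketch; the -start value is illustrative): apply the
# accent fix to every page, from the beginning of the main namespace:
#     python Replace2.py -fix:tutti-che -start:!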
class XmlDumpReplacePageGenerator:
    """
    Generator which will yield Pages for pages that might contain text to
    replace. These pages will be retrieved from a local XML dump file
    (cur table).
    """
    def __init__(self, xmlFilename, replacements, exceptions):
        """
        Arguments:
            * xmlFilename  - The dump's path, either absolute or relative
            * replacements - A list of 2-tuples of original text (as a compiled
                             regular expression) and replacement text (as a
                             string).
            * exceptions   - A list of compiled regular expressions; pages which
                             contain text that matches one of these won't be
                             changed.
        """

        self.xmlFilename = xmlFilename
        self.replacements = replacements
        self.exceptions = exceptions
    
    def __iter__(self):
        import xmlreader
        mysite = wikipedia.getSite()
        dump = xmlreader.XmlDump(self.xmlFilename)
        for entry in dump.parse():
            skip_page = False
            for exception in self.exceptions:
                if exception.search(entry.text):
                    skip_page = True
                    break
            if not skip_page:
                # TODO: leave out pages that only have old inside nowiki, comments, math
                for old, new in self.replacements:
                    if old.search(entry.text):
                        yield wikipedia.Page(mysite, entry.title)
                        break
    

class ReplaceRobot:
    """
    A bot that can do text replacements.
    """
    def __init__(self, generator, replacements, exceptions = [], acceptall = False):
        """
        Arguments:
            * generator    - A generator that yields Page objects.
            * replacements - A list of 2-tuples of original text (as a compiled
                             regular expression) and replacement text (as a 
                             string).
            * exceptions   - A list of compiled regular expressions; pages which
                             contain text that matches one of these won't be
                             changed.
            * acceptall    - If True, the user won't be prompted before changes
                             are made.
        """
        self.generator = generator
        self.replacements = replacements
        self.exceptions = exceptions
        self.acceptall = acceptall

    def checkExceptions(self, original_text):
        """
        If one of the exceptions applies to the given text, returns the
        substring which matches the exception. Otherwise it returns None.
        """
        for exception in self.exceptions:
            hit = exception.search(original_text)
            if hit:
                return hit.group(0)
        return None

    def doReplacements(self, original_text):
        """
        Returns the text which is generated by applying all replacements to the
        given text.
        """
        new_text = original_text
        for old, new in self.replacements:
            new_text = wikipedia.replaceExceptMathNowikiAndComments(new_text, old, new)
        return new_text
        
    def run(self):
        """
        Starts the robot.
        """
        # Run the generator which will yield Pages which might need to be
        # changed.
        for page in self.generator:
            try:
                # Load the page's text from the wiki
                original_text = page.get()
                if not page.canBeEdited():
                    wikipedia.output(u'Skipping locked page %s' % page.title())
                    continue
            except wikipedia.NoPage:
                wikipedia.output(u'Page %s not found' % page.title())
                continue
            except wikipedia.IsRedirectPage:
                original_text = page.get(get_redirect=True)
            match = self.checkExceptions(original_text)
            # skip all pages that contain certain texts
            if match:
                wikipedia.output(u'Skipping %s because it contains %s' % (page.title(), match))
            else:
                new_text = self.doReplacements(original_text)
                if new_text == original_text:
                    wikipedia.output(u'No changes were necessary in %s' % page.title())
                else:
                    wikipedia.output(u'\n>>> %s <<<' % page.title())
                    wikipedia.showDiff(original_text, new_text)
                    if not self.acceptall:
                        choice = wikipedia.inputChoice(u'Do you want to accept these changes?',  ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N')
                        if choice in ['a', 'A']:
                            self.acceptall = True
                    if self.acceptall or choice in ['y', 'Y']:
                        page.put(new_text)

def prepareRegexForMySQL(pattern):
    pattern = pattern.replace(r'\s', '[:space:]')
    pattern = pattern.replace(r'\d', '[:digit:]')
    pattern = pattern.replace(r'\w', '[:alnum:]')
    
    pattern = pattern.replace("'", "\\" + "'")
    #pattern = pattern.replace('\\', '\\\\')
    #for char in ['[', ']', "'"]:
    #    pattern = pattern.replace(char, '\%s' % char)
    return pattern
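
# For example (a sketch): prepareRegexForMySQL(r"\spoich(è|e')") returns
# "[:space:]poich(è|e\')", which is then interpolated into the RLIKE clauses
# built in main() below.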
    
                        
def main():
    gen = None
    # How we want to retrieve information on which pages need to be changed.
    # Can either be 'xmldump', 'textfile' or 'userinput'.
    source = None
    # Array which will collect commandline parameters.
    # First element is original text, second element is replacement text.
    commandline_replacements = []
    # A list of 2-tuples of original text and replacement text.
    replacements = []
    # Don't edit pages which contain certain texts.
    exceptions = []
    # Should the elements of 'replacements' and 'exceptions' be interpreted
    # as regular expressions?
    regex = False
    # Predefined fixes from dictionary 'fixes' (see above).
    fix = None
    # the dump's path, either absolute or relative, which will be used when source
    # is 'xmldump'.
    xmlFilename = None
    useSql = False
    # the textfile's path, either absolute or relative, which will be used when
    # source is 'textfile'.
    textfilename = None
    # the category name which will be used when source is 'category'.
    categoryname = None
    # pages which will be processed when the -page parameter is used
    PageTitles = []
    # a page whose referrers will be processed when the -ref parameter is used
    referredPageTitle = None
    # a page whose links will be processed when the -links parameter is used
    linkingPageTitle = None
    # will become True when the user presses a ('yes to all') or uses the -always
    # commandline parameter.
    acceptall = False
    # Which namespaces should be processed?
    # default to [] which means all namespaces will be processed
    namespaces = []
    # Which page to start
    startpage = None
    # Google query
    googleQuery = None
    # Load default summary message.
    wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg))

    # Read commandline parameters.
    for arg in wikipedia.handleArgs():
        if arg == '-regex':
            regex = True
        elif arg.startswith('-file'):
            if len(arg) == 5:
                textfilename = wikipedia.input(u'Please enter the textfile\'s filename:')
            else:
                textfilename = arg[6:]
            gen = pagegenerators.TextfilePageGenerator(textfilename)
        elif arg.startswith('-cat'):
            if len(arg) == 4:
                categoryname = wikipedia.input(u'Please enter the category name:')
            else:
                categoryname = arg[5:]
            cat = catlib.Category(wikipedia.getSite(), 'Category:%s' % categoryname)
            gen = pagegenerators.CategorizedPageGenerator(cat)
        elif arg.startswith('-xml'):
            if len(arg) == 4:
                xmlFilename = wikipedia.input(u'Please enter the XML dump\'s filename:')
            else:
                xmlFilename = arg[5:]
        elif arg =='-sql':
            useSql = True
        elif arg.startswith('-page'):
            if len(arg) == 5:
                PageTitles.append(wikipedia.input(u'Which page do you want to change?'))
            else:
                PageTitles.append(arg[6:])
            source = 'specificPages'
        elif arg.startswith('-ref'):
            if len(arg) == 4:
                referredPageTitle = wikipedia.input(u'Links to which page should be processed?')
            else:
                referredPageTitle = arg[5:]
            referredPage = wikipedia.Page(wikipedia.getSite(), referredPageTitle)
            gen = pagegenerators.ReferringPageGenerator(referredPage)
        elif arg.startswith('-links'):
            if len(arg) == 6:
                linkingPageTitle = wikipedia.input(u'Links from which page should be processed?')
            else:
                linkingPageTitle = arg[7:]
            linkingPage = wikipedia.Page(wikipedia.getSite(), linkingPageTitle)
            gen = pagegenerators.LinkedPageGenerator(linkingPage)
        elif arg.startswith('-start'):
            if len(arg) == 6:
                firstPageTitle = wikipedia.input(u'At which page do you want to start?')
            else:
                firstPageTitle = arg[7:]
            namespace = wikipedia.Page(wikipedia.getSite(), firstPageTitle).namespace()
            gen = pagegenerators.AllpagesPageGenerator(firstPageTitle, namespace)
        elif arg.startswith('-google'):
            if len(arg) >= 8:
                googleQuery = arg[8:]
            gen = pagegenerators.GoogleSearchPageGenerator(googleQuery)
        elif arg.startswith('-except:'):
            exceptions.append(arg[8:])
        elif arg.startswith('-fix:'):
            fix = arg[5:]
        elif arg == '-always':
            acceptall = True
        elif arg.startswith('-namespace:'):
            namespaces.append(int(arg[11:]))
        else:
            commandline_replacements.append(arg)

    if (len(commandline_replacements) == 2 and fix == None):
        replacements.append((commandline_replacements[0], commandline_replacements[1]))
        wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg ) % ' (-' + commandline_replacements[0] + ' +' + commandline_replacements[1] + ')')
    elif fix == None:
        old = wikipedia.input(u'Please enter the text that should be replaced:')
        new = wikipedia.input(u'Please enter the new text:')
        change = '(-' + old + ' +' + new
        replacements.append((old, new))
        while True:
            old = wikipedia.input(u'Please enter another text that should be replaced, or press Enter to start:')
            if old == '':
                change = change + ')'
                break
            new = wikipedia.input(u'Please enter the new text:')
            change = change + ' & -' + old + ' +' + new
            replacements.append((old, new))
        default_summary_message =  wikipedia.translate(wikipedia.getSite(), msg) % change
        wikipedia.output(u'The summary message will default to: %s' % default_summary_message)
        summary_message = wikipedia.input(u'Press Enter to use this default message, or enter a description of the changes your bot will make:')
        if summary_message == '':
            summary_message = default_summary_message
        wikipedia.setAction(summary_message)
    else:
        # Perform one of the predefined actions.
        try:
            fix = fixes[fix]
        except KeyError:
            wikipedia.output(u'Available predefined fixes are: %s' % fixes.keys())
            wikipedia.stopme()
            sys.exit()
        if fix.has_key('regex'):
            regex = fix['regex']
        if fix.has_key('msg'):
            wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), fix['msg']))
        if fix.has_key('exceptions'):
            exceptions = fix['exceptions']
        replacements = fix['replacements']

    
    # already compile all regular expressions here to save time later
    for i in range(len(replacements)):
        old, new = replacements[i]
        if not regex:
            old = re.escape(old)
        oldR = re.compile(old, re.UNICODE)
        replacements[i] = oldR, new
    for i in range(len(exceptions)):
        exception = exceptions[i]
        if not regex:
            exception = re.escape(exception)
        exceptionR = re.compile(exception, re.UNICODE)
        exceptions[i] = exceptionR
    
    if xmlFilename:
        gen = XmlDumpReplacePageGenerator(xmlFilename, replacements, exceptions)
    elif useSql:
        whereClause = 'WHERE (%s)' % ' OR '.join(["old_text RLIKE '%s'" % prepareRegexForMySQL(old.pattern) for (old, new) in replacements]) 
        if exceptions:
            exceptClause = 'AND NOT (%s)' % ' OR '.join(["old_text RLIKE '%s'" % prepareRegexForMySQL(exc.pattern) for exc in exceptions])
        else:
            exceptClause = ''
        query = u"""
SELECT page_namespace, page_title
FROM page
JOIN text ON (page_id = old_id)
%s
%s
LIMIT 200""" % (whereClause, exceptClause)
        gen = pagegenerators.MySQLPageGenerator(query)

    elif PageTitles:
        pages = [wikipedia.Page(wikipedia.getSite(), PageTitle) for PageTitle in PageTitles]
        gen = iter(pages)

    if not gen:
        # syntax error, show help text from the top of this file
        wikipedia.output(__doc__, 'utf-8')
        wikipedia.stopme()
        sys.exit()
    if namespaces != []:
        gen =  pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
    preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber = 20)
    bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall)
    bot.run()


if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()