# User:cool Blue Chicagobot  (presumably the wiki page title this script was copied from; commented out so the file parses)

#! /usr/local/bin/python
# -*- coding: utf-8 -*-

"""
Wikipedia:WikiProject Chicago - Task-listing automation bot.

This bot will grab all the new tasks from the specified
category and add them to the tasks page in the identified section.

"""

import re, sys, string
import wikipedia, catlib, config

#**************
#* Variables: *
#**************

# Description of the keys used in each task-definition dict():
# category - Category to draw list of new tasks from.
# target - Target page to add list of new tasks to.
# section -
#   Section of target page to replace, which is delimited
#   by <!-- BEGIN [section] --> <!-- END [section] -->
# titlemask -
#   The title mask removes some portion of the title from
#   the link list.
# exclude -
#   Page exclusion regex.  List the pages that you don't
#   want to have included in the output list.
#   Example:
#   Template\:Medcab2$|Wikipedia\:Mediation Cabal\/Complaints$
# action - Update action text.

# Settings shared by every task list: they are all written to the same
# page, drop the same title prefix, skip the same namespaces and use the
# same edit summary.  Only the category and the section marker differ.
_TASK_DEFAULTS = {
  'target': 'Wikipedia:WikiProject Chicago/Automated taskboard',
  'titlemask': r"^Wikipedia\:WikiProject Chicago\/Automated taskboard\/",
  'exclude': r"^Template\:.*|^User:.*",
  'action': "Updating taskboard",
}


def _task(category, section):
  """Return a task-definition dict for `category` written to `section`."""
  task = dict(_TASK_DEFAULTS)
  task['category'] = category
  task['section'] = section
  return task


# One entry per section of the taskboard.  These used to be five
# successive assignments to the same name, so only the last one survived.
TASKS = [
  # Needing attention
  _task('Chicago articles needing attention', 'NeedingAttention'),
  # Needing photographs
  _task('Chicago articles needing photographs', 'NeedingPhotographs'),
  # Needing infoboxes
  _task('Chicago articles needing infoboxes', 'NeedingInfoboxes'),
  # Needing expansion
  _task('Stub-Class Chicago articles', 'NeedingExpansion'),
  # Needing assessed
  _task('Unassessed Chicago articles', 'NeedingAssessed'),
]

# Historical module-level name.  It keeps the value it always ended up
# with (the last definition) so anything still reading it is unaffected.
# NOTE: this shadows the builtin open() inside this module.
open = TASKS[-1]

# This should be run infrequently, using relatively short delays for the processing.
wikipedia.get_throttle.setDelay(5) # delay of 5 (the old comment claimed 6 seconds)
wikipedia.put_throttle.setDelay(5) # delay of 5 (the old comment claimed 6 seconds)

# *******************
# * ChicagoBot Class*
# *******************
class ChicagoBot:
  """Rebuild the automated taskboard sections from their categories.

  Each task definition names a category, a target page and a section of
  that page delimited by <!-- BEGIN [section] --> / <!-- END [section] -->
  markers.  The bot replaces the text between the markers with a bullet
  list of the pages found in the category.
  """

  def __init__(self, tasks=None):
    """Create the bot.

    tasks - optional sequence of task-definition dicts; when omitted,
            the module-level TASKS list is used.
    """
    if tasks is None:
      tasks = TASKS
    self.tasks = list(tasks)

  def run(self):
    """Process every task and save each target page at most once.

    Tasks are grouped by their 'target' so that a page shared by several
    sections is fetched once, updated section by section in memory, and
    written back only if its text actually changed.
    """
    # Group the tasks by target page, remembering first-seen order.
    targets = []
    tasks_by_target = {}
    for task in self.tasks:
      target = task['target']
      if target not in tasks_by_target:
        tasks_by_target[target] = []
        targets.append(target)
      tasks_by_target[target].append(task)

    site = wikipedia.getSite()
    for target in targets:
      wikipedia.output(u'Processing ' + target)
      page_target = wikipedia.Page(site, target)
      page_data = page_target.get()

      new_page_data = page_data
      actions = []
      for task in tasks_by_target[target]:
        updated = self.process_category(task, new_page_data)
        if updated != new_page_data:
          new_page_data = updated
          # Collect each distinct summary once, so identical action
          # texts are not repeated in the edit summary.
          if task['action'] not in actions:
            actions.append(task['action'])

      # Check if the page has changed at all.
      if new_page_data != page_data:
        wikipedia.output(u'Updating taskboard')
        wikipedia.setAction(u' & '.join(actions))
        page_target.put(new_page_data)
      else:
        # Otherwise, tell the user and move on.
        wikipedia.output(u'Taskboard is already up-to-date')

  def process_category(self, pgt, page_data):
    """Return `page_data` with one section replaced by a fresh page list.

    pgt       - task-definition dict; the keys 'category', 'section',
                'titlemask' and 'exclude' are read here.
    page_data - current text of the target page.

    Returns the new page text.  `page_data` is returned unchanged when
    the category is empty; it is also effectively unchanged when the
    section markers are not present in the text.
    """
    category = pgt['category']
    section = pgt['section']

    # Setup Regular Expressions used later.
    exclude_regex = re.compile(pgt['exclude'])
    titlemask_regex = re.compile(pgt['titlemask'])

    # Create instance of catlib object and specify category.
    cat = catlib.Category(wikipedia.getSite(), 'Category:' + category)

    # list() so that len() works whether articles() hands back a list
    # or a lazy iterable.
    pages = list(cat.articles())
    total = len(pages)

    # Check if there are any pages in the category.
    if total == 0:
      wikipedia.output('Category:' + category + ' is empty, doing nothing.')
      return page_data

    wikipedia.output(u'Now processing ' + str(total) + ' Chicago task pages.')

    entries = []
    for index, page in enumerate(pages):
      title = page.title()
      progress = str(index + 1) + u' of ' + str(total) + ' ' + title
      # Check to see whether it's in the exclude list.
      if exclude_regex.match(title):
        wikipedia.output(progress + ' - Skipping')
        continue
      wikipedia.output(progress)
      # One bullet per line; without the trailing newline every entry
      # ends up on the same wiki line and renders as a single bullet.
      entries.append(u'* [[' + title + '|' + titlemask_regex.sub('', title) + ']]\n')

    # Finish the formatting of the pagelist.
    pagelist = (u'<!-- BEGIN ' + section + ' -->\n' + u''.join(entries)
                + '<!-- END ' + section + ' -->')

    # Setup regex to find replaced region.  The section name is escaped
    # so that it is always matched literally.
    marker = re.escape(section)
    section_regex = re.compile(
      r'<!-- BEGIN ' + marker + r' -->.*?<!-- END ' + marker + r' -->', re.S)

    # A function replacement inserts pagelist verbatim; a plain string
    # would have backslashes in titles interpreted as regex escapes.
    new_page_data, replaced = section_regex.subn(lambda match: pagelist, page_data)
    if replaced == 0:
      wikipedia.output(u'No BEGIN/END markers for section ' + section
                       + u' found, nothing replaced.')
    return new_page_data

  def get_tmpl_params(self, page, tmpl_name):
    """Parse the named parameters of the first `tmpl_name` template call.

    page      - wikitext to search.
                NOTE(review): nothing in this file calls this method; the
                text is searched directly, so a string is assumed here
                rather than a Page object - confirm against callers.
    tmpl_name - template name; it is inserted into a regular expression
                unescaped and matched case-insensitively.

    Returns a dict mapping parameter name to value (both stripped of
    surrounding whitespace), or None when the template is not found or
    its closing braces are missing.  Parameters without a top-level '='
    (positional parameters) are skipped.  Pipes and equal signs inside
    nested {{...}} or [[...]] do not split parameters.
    """
    page_data = page

    # Find the template opening up to and including its first pipe.
    head = re.compile(r'\{\{' + tmpl_name + r'\s*\|', re.I | re.S).search(page_data)
    if not head:
      return None

    token_regex = re.compile(r'\{\{|\}\}|\[\[|\]\]|\||=')

    # Each field is (start, position of first top-level '=' or None, end).
    fields = []
    field_start = head.end()
    equals_at = None
    brace_depth = 1  # we are already inside the template being parsed
    link_depth = 0
    closed_at = None

    for token in token_regex.finditer(page_data, head.end()):
      text = token.group()
      if text == '{{':
        brace_depth += 1
      elif text == '}}':
        brace_depth -= 1
        if brace_depth == 0:
          # This is the delimiter closing our own template.
          closed_at = token.start()
          break
      elif text == '[[':
        link_depth += 1
      elif text == ']]':
        # Ignore a stray closing bracket pair.
        if link_depth > 0:
          link_depth -= 1
      elif brace_depth == 1 and link_depth == 0:
        if text == '=':
          # Only the first top-level '=' separates name from value.
          if equals_at is None:
            equals_at = token.start()
        else:
          # Top-level pipe: the current parameter ends here.
          fields.append((field_start, equals_at, token.start()))
          field_start = token.end()
          equals_at = None

    if closed_at is None:
      # Unbalanced braces: there is no reliable parameter list.
      return None
    fields.append((field_start, equals_at, closed_at))

    params = dict()
    for start, equals, end in fields:
      if equals is None:
        continue
      params[page_data[start:equals].strip()] = page_data[equals + 1:end].strip()

    return params

if __name__ == "__main__":
  # Script entry point: build the bot and run it once.  The finally
  # clause guarantees wikipedia.stopme() is called whether the run
  # succeeds or raises.
  try:
    ChicagoBot().run()
  finally:
    wikipedia.stopme()