# User:JarektUploadBot/FixWGAMetadataInfo.py

#!/usr/bin/python
# -*- coding: utf-8  -*-
'''
A program to update the file descriptions of images from the Web Gallery of Art website (http://www.wga.hu/)
that were manually uploaded to Commons. This code is for files using the {{Information}} template.
 
'''
import sys, os.path, glob, re, hashlib, base64, StringIO
sys.path.append("C:/Programs/pywikipedia/")
sys.path.append("../")
import wikipedia, upload, csv, urllib2, string, catlib

# Horizontal rule printed above and below each section title in the console log.
_BANNER_RULE = "================================================================================"

# CSV columns decoded to unicode text (IMG_ID is handled separately as an int).
_TEXT_COLUMNS = (
    'CREATOR', 'DATE', 'TITLE', 'DIMENSIONS', 'TECHNIQUE', 'FILENAME',
    'FILENAME2', 'FORM', 'TYPE', 'SCHOOL', 'TIMELINE', 'INSTITUTION',
    'CREATOR_CAT', 'INSTITUTION_CAT', 'TITLE_CAT', 'DATE_CAT', 'URL',
    'IMAGEURL', 'FRAME',
)

# Captures a category link's name only: stops at "]" or at the "|" sort key.
_CATEGORY_NAME_RE = re.compile(r"\[\[[Cc]ategory:([^\]\|]*)")
# Captures a category link's whole target, including any "|sort key".
_CATEGORY_LINK_RE = re.compile(r"\[\[[Cc]ategory:([^\]]*)")

# New file description; filled in with the metadata dictionary via the % operator.
_ARTICLE_TEMPLATE = u"""{{Artwork     
  |artist           = %(CREATOR)s
  |title            = {{en|%(TITLE)s}}
  |description      = 
  |date             = %(DATE)s
  |medium           = %(TECHNIQUE)s
  |dimensions       = %(DIMENSIONS)s
  |institution      = %(INSTITUTION)s
  |location         = <!-- location within the gallery/museum -->     
  |references       =
  |object history   =
  |credit line      =
  |inscriptions     =
  |notes            = 
  |accession number =
  |source           = {{WGA link|ID=%(IMG_ID)s|pic-url=%(IMAGEURL)s|info-url=%(URL)s}}     
  |permission       = {{PD-art|PD-old-100}}
  |other_versions   =
}}
%(FRAME)s
{{WGA tag|%(FORM)s|%(TYPE)s|%(SCHOOL)s|%(TIMELINE)s}}     
[[Category:WGA form: %(FORM)s]]
[[Category:WGA type: %(TYPE)s]]
[[Category:WGA School: %(SCHOOL)s]]
[[Category:WGA time period: %(TIMELINE)s]]
"""


def _readMetadata(row, enc):
    """Convert one CSV row into the metadata dictionary used for formatting.

    IMG_ID is converted with int(); every column in _TEXT_COLUMNS is decoded
    from `enc`. A missing or empty cell becomes an empty string instead of
    making unicode() fail on None. The derived key FORM1 (capitalized FORM) is
    added, FILENAME2 gets the 'File:' prefix, and FILENAME2, CREATOR and
    INSTITUTION are stripped of surrounding whitespace.
    """
    metadata = {'IMG_ID': int(row.get(u'IMG_ID'))}
    for column in _TEXT_COLUMNS:
        metadata[column] = unicode(row.get(column) or '', enc)
    metadata['FORM1'] = metadata['FORM'].capitalize()
    metadata['FILENAME2'] = 'File:' + metadata['FILENAME2'].strip()
    metadata['CREATOR'] = metadata['CREATOR'].strip()
    metadata['INSTITUTION'] = metadata['INSTITUTION'].strip()
    return metadata


def _outputBanner(titleLine):
    """Print `titleLine` framed by two rule lines through wikipedia.output."""
    wikipedia.output(_BANNER_RULE)
    wikipedia.output(titleLine)
    wikipedia.output(_BANNER_RULE)


def _collectParentCats(targetSite, desc):
    """Return the page's categories and their supercategories, one title per line.

    The result is a single newline-separated string because the callers test
    candidate categories against it with substring matching.
    """
    lines = []
    for match in _CATEGORY_NAME_RE.finditer(desc):
        title = u'Category:%s' % match.group(1)
        lines.append(title + u'\n')
        # The line terminator belongs to the listing only; the category
        # object is created from the bare title.
        category = catlib.Category(targetSite, title.strip())
        for parent in category.supercategoriesList():
            lines.append(parent.title() + u'\n')
    return u''.join(lines)


def _existingCategoryLinks(desc):
    """Return the page's own category links as wikitext, sort keys preserved.

    Links are de-duplicated on the exact category name (the text before any
    "|"), keeping the first occurrence. Comparing whole names rather than
    searching the accumulated wikitext keeps a category whose name happens to
    be contained in an earlier one (e.g. "Paintings" after "Paintings by X").
    """
    seen = set()
    links = []
    for match in _CATEGORY_LINK_RE.finditer(desc):
        target = match.group(1)
        name = target.partition(u'|')[0].strip()
        if name in seen:
            continue
        seen.add(name)
        links.append(u'[[Category:%s]]\n' % target)
    return u''.join(links)


def _buildCategoryWikitext(desc, parentCats, metadata):
    """Return the category wikitext to append to the new description.

    Starts from the categories already on the page and adds the title, date,
    creator and institution categories from `metadata`, each only when its
    text does not already occur in `parentCats`. An empty category value is
    never added, because the empty string is a substring of any string.
    """
    additions = []

    for key in ('TITLE_CAT', 'DATE_CAT'):
        cat = metadata[key]
        if cat not in parentCats:
            additions.append(u'\n[[Category:' + cat + u']]')

    # Only the last word of the creator category (text after the final
    # space) is looked for among the existing categories.
    lastWord = metadata['CREATOR_CAT'].rpartition(' ')[2]
    if lastWord not in parentCats:
        additions.append(u'\n{{subst:#ifexist:Category:%(FORM1)ss by %(CREATOR_CAT)s|[[Category:%(FORM1)ss by %(CREATOR_CAT)s]]|[[Category:%(CREATOR_CAT)s]]}}' % metadata)

    if metadata['INSTITUTION_CAT'] not in parentCats:
        additions.append(u'\n{{subst:#ifexist:Category:%(FORM1)ss in the %(INSTITUTION_CAT)s|[[Category:%(FORM1)ss in the %(INSTITUTION_CAT)s]]|[[Category:%(INSTITUTION_CAT)s]]}}' % metadata)

    cats = _existingCategoryLinks(desc) + u''.join(additions)

    cats = cats.replace(" |", "|")
    cats = cats.replace("| ", "|")
    cats = cats.replace("Paintings by  ", "Paintings by ")
    # Drop the space after "Category:" before removing empty links, so that
    # "[[Category: ]]" is removed as well.
    cats = cats.replace("[[Category: ", "[[Category:")
    cats = cats.replace("[[Category:]]\n", "")
    cats = cats.replace("[[Category:]]", "")
    return cats


def processFile(row):
    """Rewrite the description page of one uploaded WGA image.

    Builds an {{Artwork}} based description from the CSV metadata, keeps the
    categories already present on the page, adds the title, date, creator and
    institution categories that are not yet covered by the page's categories
    or their parent categories, and saves the result.

    Parameters:
        row -- mapping of column name to raw cell text for one CSV line, as
               produced by csv.DictReader. Must contain an integer IMG_ID;
               the text columns listed in _TEXT_COLUMNS are UTF-8 encoded and
               default to an empty string when absent.

    Raises:
        TypeError / ValueError -- if IMG_ID is missing or is not an integer.
        Errors raised by the wikipedia/catlib calls (reading the page,
        listing supercategories, saving) are not caught here.
    """
    metadata = _readMetadata(row, 'utf-8')

    # Format file description
    description = _ARTICLE_TEMPLATE % metadata

    # Get the current page text
    targetSite = wikipedia.getSite('commons', 'commons')
    page = wikipedia.Page(targetSite, metadata['FILENAME2'])
    desc = page.get()
    _outputBanner("=== BEFORE =====================================================================")
    wikipedia.output(desc)

    # Get the file's categories and the parent categories of those
    parentCats = _collectParentCats(targetSite, desc)
    _outputBanner("=== parent =====================================================================")
    wikipedia.output(parentCats)

    description = description + _buildCategoryWikitext(desc, parentCats, metadata)
    description = description.replace("{{}}", "")
    # Single pass: each pair of consecutive newlines collapses to one.
    description = description.replace("\n\n", "\n")

    _outputBanner("=== AFTER ======================================================================")
    wikipedia.output(description)
    # NOTE(review): the last two positional arguments are presumably
    # watchArticle=None and minorEdit=False -- confirm against Page.put.
    page.put(description, "Update metadata and categories", None, False)
        
def main(args):
    """Process every row of the metadata CSV file with processFile().

    The CSV file name is fixed ('WGA_infobatch1b.csv', resolved against the
    current working directory) and its first line supplies the column names.

    Parameters:
        args -- command-line arguments; accepted for the caller's benefit but
                currently not used.

    Exits through sys.exit() with the file name, line number and error text
    when the csv module reports a parsing error. Exceptions raised by
    processFile() are not caught here.
    """
    csvFile = 'WGA_infobatch1b.csv'

    # The with-block guarantees the file is closed on every exit path,
    # including the sys.exit() below and errors raised by processFile().
    with open(csvFile, "rb") as csvHandle:
        reader = csv.DictReader(csvHandle, dialect='excel', delimiter=',')
        try:
            for row in reader:
                processFile(row)
        except csv.Error as e:
            sys.exit('file %s, line %d: %s' % (csvFile, reader.line_num, e))
   
   
if __name__ == "__main__":
    # Script entry point: pass everything after the script name to main().
    # The closing message sits in the finally clause, so it is printed
    # whether main() returns normally or raises.
    cliArgs = sys.argv[1:]
    try:
        main(cliArgs)
    finally:
        print("All done!")