User:JarektUploadBot/Upload Open Access Images.py

#!/usr/bin/python
# -*- coding: utf-8  -*-
'''
A program to upload Open Access images to Wikimedia Commons, driven by a CSV file
 
'''
import sys, os.path, glob, re, hashlib, base64, StringIO
import wikipedia, upload, csv, urllib2, string, catlib
 
def _pad_iso_date(date):
    """Zero-pad one-digit month/day parts of ``YYYY-M-D`` dates inside *date*.

    The trailing ``(?!\\d)`` keeps the day group from matching only the first
    digit of a two-digit day, so ``2010-12-15`` is left untouched rather than
    being turned into ``2010-12-015``.
    """
    def _pad(match):
        year, month, day = match.groups()
        return u'%s-%s-%s' % (year, month.zfill(2), day.zfill(2))

    return re.sub(r'(\d\d\d\d)-(\d\d?)-(\d\d?)(?!\d)', _pad, date)


def processFile(row):
    """Upload the image described by one CSV row to Wikimedia Commons.

    Parameters:
        row -- mapping of CSV column name to UTF-8 encoded byte string, as
               produced by csv.DictReader.  The columns read are 'en',
               'date', 'file location', 'author', 'source', 'categories',
               'journal' and 'permission'.  Missing or empty cells are
               treated as empty strings.

    The 'categories' cell holds one category name per line.  Each non-blank
    name fills the next ``categoryNN`` slot of the description template
    (the template has slots 01-19; further names are checked but not
    rendered).  If any named category does not exist on the target site,
    slot 00 is set to a maintenance category.

    The file at 'file location' is downloaded once to compute its SHA-1 and
    the row is skipped when the site already holds a file with that hash.
    Otherwise the upload is delegated to upload.UploadRobot.

    Returns None.  Errors from the network or the wiki framework propagate.
    """
    enc = 'utf-8'

    # Template field -> CSV column it is read from.
    columns = (
        (u'description', u'en'),
        (u'date',        u'date'),
        (u'filename',    u'file location'),
        (u'author',      u'author'),
        (u'source',      u'source'),
        (u'categories',  u'categories'),
        (u'journal',     u'journal'),
        (u'permission',  u'permission'),
    )
    # `or ''` guards against None (short rows / absent columns), which
    # unicode() cannot decode.
    metadata = dict(
        (field, unicode(row.get(column) or '', enc))
        for field, column in columns
    )

    if not metadata['filename']:
        # Nothing to download or upload without a location.
        wikipedia.output(u'Skipping row without a file location')
        return

    # Format file description
    template = u"""{{subst:User:JarektUploadBot/Open Access Subst
 |description    = %(description)s
 |date           = %(date)s
 |source         = %(source)s
 |author         = %(author)s
 |permission     = %(permission)s
 |other_versions =
 |journal        = %(journal)s
 |category00     = %(category00)s
 |category01     = %(category01)s
 |category02     = %(category02)s
 |category03     = %(category03)s
 |category04     = %(category04)s
 |category05     = %(category05)s
 |category06     = %(category06)s
 |category07     = %(category07)s
 |category08     = %(category08)s
 |category09     = %(category09)s
 |category10     = %(category10)s
 |category11     = %(category11)s
 |category12     = %(category12)s
 |category13     = %(category13)s
 |category14     = %(category14)s
 |category15     = %(category15)s
 |category16     = %(category16)s
 |category17     = %(category17)s
 |category18     = %(category18)s
 |category19     = %(category19)s
}}
"""
    targetSite = wikipedia.getSite('commons', 'commons')

    # Every slot the template references must exist, so start them all empty.
    for slot in range(20):
        metadata['category%02i' % slot] = ''

    # Blank lines (or an empty cell) carry no category and are ignored.
    stripped = [cat.strip() for cat in metadata['categories'].split('\n')]
    categories = [cat for cat in stripped if cat]
    for slot, cat in enumerate(categories, 1):
        metadata['category%02i' % slot] = cat
        if not catlib.Category(targetSite, cat).exists():
            metadata['category00'] = 'File with non-existing species category'

    # Target name: last path component of the location, prefixed with the
    # first category when there is one.  [-1] also copes with a location
    # that contains no '/'.
    fname = metadata['filename'].rsplit('/', 1)[-1]
    if metadata['category01']:
        fname = metadata['category01'] + ' - ' + fname

    metadata['date'] = _pad_iso_date(metadata['date'])
    description = template % metadata
    wikipedia.output(fname)
    wikipedia.output(description)

    # We don't want to upload duplicates, so fetch the file, calculate its
    # SHA1 hash and ask the MediaWiki API for files with the same hash.
    response = urllib2.urlopen(metadata['filename'])
    try:
        imageData = response.read()
    finally:
        response.close()
    SHA1 = base64.b16encode(hashlib.sha1(imageData).digest())
    duplicates = targetSite.getFilesFromAnHash(SHA1)

    if duplicates:
        duplicate = duplicates.pop()
        wikipedia.output(u'Duplicate image: %s = %s'
                         % (duplicate, metadata['filename']))
        return

    # upload file to Commons
    bot = upload.UploadRobot(url=metadata['filename'],
                             description=description,
                             useFilename=fname,
                             keepFilename=True,
                             verifyDescription=False,
                             targetSite=targetSite)
    bot.run()  # Comment out this line to have a dry run


 
def main(args):
    """Read the metadata CSV and call processFile() for each row.

    Parameters:
        args -- command-line arguments (without the program name).  The
                first argument that does not start with '-' is used as the
                CSV path; when there is none, 'Open Access1.csv' in the
                current directory is read.

    The CSV is parsed with csv.DictReader using the 'excel' dialect, so the
    first line supplies the column names.  A csv.Error ends the program via
    sys.exit() with the file name and line number; other exceptions
    propagate to the caller.
    """
    # Ignore option-like arguments so that passing flags never changes
    # which file is read.
    positional = [arg for arg in (args or []) if not arg.startswith('-')]
    csvFile = positional[0] if positional else 'Open Access1.csv'

    # Set a number of files to process, or set to -1 for all files
    maxFiles = -1

    processed = 0
    # "rb" is what the Python 2 csv module expects; `with` makes sure the
    # handle is closed on every exit path, including sys.exit().
    with open(csvFile, "rb") as csvHandle:
        reader = csv.DictReader(csvHandle, dialect='excel', delimiter=',')
        try:
            for row in reader:
                processFile(row)
                processed += 1
                if processed == maxFiles:
                    return
        except csv.Error as e:
            sys.exit('file %s, line %d: %s' % (csvFile, reader.line_num, e))
 
 
if __name__ == "__main__":
    # Script entry point: hand everything after the program name to main().
    cliArgs = sys.argv[1:]
    try:
        main(cliArgs)
    finally:
        # Printed on every exit path, including when main() raises.
        print("All done!")