# User:Dschwen/Gifanalysisbot

#!/usr/bin/python

import sys, os print os.environ['HOME'] sys.path.append(os.environ['HOME'] + '/dschwen_bot/pywikipedia') sys.path.append(os.environ['HOME'] + '/dschwen_bot/urlgrabber-3.1.0/urlgrabber')

import wikipedia import MySQLdb import pyexiv2 import re import math import string import hashlib import unicodedata import htmlentitydefs import marshal from PHPUnserialize import * from datetime import timedelta from datetime import datetime

import array import urllib2 import byterange range_handler = byterange.HTTPRangeHandler() opener = urllib2.build_opener(range_handler) urllib2.install_opener(opener)

useragent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'

  1. look at images of the last three days
  1. older than

dt = timedelta(0) cut1 = datetime.now() - dt

  1. but newer than

dt = timedelta(2) cut2 = datetime.now() - dt

def unescape_charref(ref) :
    """Convert a numeric character reference to the character it names.

    ref is the complete reference text, including the leading "&#" and the
    trailing ";", e.g. "&#65;" (decimal) or "&#x41;" / "&#X41;" (hex).

    Returns the corresponding one-character unicode string.  A reference
    whose digits cannot be parsed, or whose code point unichr() rejects, is
    returned unchanged, matching how replace_entities() leaves unknown named
    entities untouched.
    """
    name = ref[2:-1]
    base = 10
    # Both "x" and "X" introduce a hexadecimal reference.
    if name[:1] in ("x", "X") :
        name = name[1:]
        base = 16
    try :
        return unichr(int(name, base))
    except (ValueError, OverflowError) :
        # Not a valid number for this base, or outside unichr()'s range.
        return ref
                                         

def replace_entities(match) :
    """Substitution callback for re.sub(): expand one matched entity.

    match is a regex match whose full text is an entity such as "&amp;" or
    a numeric reference such as "&#38;".  Numeric references are delegated
    to unescape_charref(); named entities are looked up in
    htmlentitydefs.name2codepoint.  An unknown name is returned as matched.
    """
    entity = match.group()

    # "&#...;" is a numeric character reference.
    if entity[1] == "#" :
        return unescape_charref(entity)

    # Strip the surrounding "&" and ";" to get the entity name.
    codepoint = htmlentitydefs.name2codepoint.get(entity[1:-1])
    if codepoint is None :
        return entity
    return unichr(codepoint)
                                                                                                   

def unescape(data) :
    """Return data with HTML entities and character references expanded.

    Every substring of the form "&name;" or "&#digits;" is passed to
    replace_entities(), and its return value is substituted in place.
    """
    entity_pattern = re.compile(r"&#?[A-Za-z0-9]+?;")
    return entity_pattern.sub(replace_entities, data)


# Patterns for geocoding templates in page wikitext.  One assignment per
# line; raw strings keep the backslash escapes literal for the regex engine.

# {{Location dec|<first>|<second>|...  -> the first two parameters
loc1RE = re.compile( r'\{\{[Ll]ocation[ _]dec\|([^\|]+)\|([^\|]+)\|' )
# {{Location dec|...}}                -> the whole parameter list
loc2RE = re.compile( r'\{\{[Ll]ocation[ _]dec\|([^\}\{]+)\}\}' )
# {{Location|...}}                    -> the whole parameter list
loc3RE = re.compile( r'\{\{[Ll]ocation\|([^\}\{]+)\}\}' )

# Site object obtained from pywikipedia; it is passed to wikipedia.Page()
# below when loading each image's description page.
site = wikipedia.getSite()


# Load the record of images handled in earlier runs from "taglist.gifbot"
# (a marshal dump mapping image name -> True).  If the file is missing,
# truncated or otherwise unreadable, start with an empty record.
try:
    f = open( "taglist.gifbot", "rb" )
    try:
        taglist = marshal.load( f )
    finally:
        # Close the handle even when marshal.load() fails.
        f.close()
    if not isinstance( taglist, dict ):
        # The main loop uses dict operations on taglist.
        taglist = {}
except (IOError, EOFError, ValueError, TypeError):
    # IOError: file cannot be opened or read; the others are what
    # marshal.load() raises for incomplete or invalid data.
    taglist = {}

try:

       connection = MySQLdb.connect(host="commonswiki-p.db.ts.wikimedia.org", user="dschwen", passwd="xxx", db="commonswiki_p" )
       cursor = connection.cursor() 
       print "Looking for GIF mimetype images (%s > images > %s)"  % ( cut1.strftime( "%Y%m%d%H%M%S" ), cut2.strftime( "%Y%m%d%H%M%S" ) )
       cursor.execute( "select img_name, img_width, img_height, img_size from image where img_minor_mime = 'gif' and img_timestamp > '%s' and img_timestamp < '%s' " %  ( cut2.strftime( "%Y%m%d%H%M%S" ), cut1.strftime( "%Y%m%d%H%M%S" ) ) )
       #cursor.execute( "select img_name, img_width, img_height, img_size from image where img_minor_mime = 'gif'" )
       print "fetching results..."
       data = cursor.fetchall() 
       fields = cursor.description
       cursor.close()
       connection.close()
       #
       # get potential images from taglist
       #
       print "processing results..."
       for row in range(len(data)):
               name = data[row][0]
               print name
               pixels = data[row][1] * data[row][2]
               if pixels == 0 :
                       print "0 pixels"
                       continue
               if data[row][3] / pixels < 0.005 and name.lower().find( 'anim' ) == -1 :
                       print "bytes per pixel too low, unlikely candidate"
                       continue
               if taglist.has_key( name) and taglist[ name ] :
                       continue
               decomposed_string = unicodedata.normalize( 'NFD', name.decode('utf-8') )
               #page = wikipedia.Page(site, 'Image:' + decomposed_string.encode('utf-8') )
               page = wikipedia.Page(site, 'Image:' + name.decode('utf-8') )
               text = ""
               if page.exists() :
                       text = page.get(get_redirect=True)
       
               # already contains the Animated GIF category
               if string.find(text, '[[Category:Animated GIF' ) >= 0 :
                       print "Already tagged!"
                       continue
               bs = 8000
               of = -bs - 1
               nof = 0
               isAnimated = False
               reading = True
               
               # 0 header
               # 1 block
               # 2 data
               level = 0
               frames = 0
               #if data[row][1] > 300 :
               #       m = hashlib.md5()
               #       m.update( name )
               #       h = m.hexdigest()
               #       url = "http://upload.wikimedia.org/wikipedia/commons/thumb/%s/%s/%s/120px-%s" % ( h[0], h[0:2], urllib2.quote(name), urllib2.quote(name) )
               #else :
               #       url = "http://commons.wikimedia.org/wiki/Special:Filepath/%s" % urllib2.quote(name)
               url = "http://commons.wikimedia.org/wiki/Special:Filepath/%s" % urllib2.quote(name)
               print "downloading %s ..." % url
               try:
                       while reading :
                               if nof - of >= bs - 10 :
                                       of = nof
                                       headers = { 'User-Agent' : useragent, 'Range' : 'bytes=%d-%d' % ( of, of+bs ) }
                                       req = urllib2.Request( url, , headers )
                                       f = urllib2.urlopen(req)
                                       print "Downloading"
                                       t = f.read();
                                       a = array.array('B')
                                       a.fromstring(t)
                                       f.close()
                               if level == 0 and of > 0 :
                                       print "Mayday, no header info read!"
                                       break
                               if level == 0 :
                                       # analyze header
                                       if t[:6] != 'GIF89a' and t[:6] != 'GIF87a' :
                                               print "Not a GIF image"
                                               break
                                       # global color table flag
                                       gctf = ( a[10] & 128 ) >> 7
                                       print gctf
                                       # colors
                                       colors = 2 << ( a[10] & 7 )
       
                                       nof = 13 + 3 * gctf * colors
                                       print nof, a[nof]
                                       level = 1
                                       continue
                               if level == 1 :
                                       level = 2
                                       if a[nof-of] == 0x21 :
                                               print "Found Extension block!", a[nof-of+1]
                                               nof += 2
                                               continue
                                       if a[nof-of] == 0x2c :
                                               print "Found Image block!"
                                               frames += 1
                                               if frames > 1 :
                                                       isAnimated = True
                                                       break
                                               lctf = ( a[nof-of+8] & 128 ) >> 7
                                               colors = 2 << ( a[nof-of+8] & 7 )
                                               nof += 11 + 3 * lctf * colors
                                               continue
                                       if a[nof-of] == 0x3b :
                                               break
                                       print "Unknown Block found!", a[nof-of]
                                       break
                               if level == 2 :
                                       # block terminator
                                       if a[nof-of] == 0 :
                                               level = 1
                                       # print "  block size:", a[nof-of]
                                       nof += a[nof-of] + 1


               except Exception, e:
                       print "Exception while downloading:", e
                       continue
               if not isAnimated :
                       print "static GIF"
                       taglist[ name ] = True;
                       continue
               print "YAY! tagging..."
               try:
                       wikipedia.setAction("adding")
                       text = page.get(get_redirect=True)
                       text2 = text + "\n" + ""
                       page.put(text2)
                       taglist[ name ] = True;
                       file = open( "taglist.gifbot", "wb" )
                       marshal.dump( taglist, file )
                       file.close()
               except Exception, e:
                       print "Exception while tagging:", e


       file = open( "taglist.gifbot", "wb" )
       marshal.dump( taglist, file )
       file.close()

except MySQLdb.OperationalError, message:

       errorMessage = "Error %d:\n%s" % (message[ 0 ], message[ 1 ] )
# Last modified on 21 June 2013, at 17:10