User:Dschwen/Gifanalysisbot

< User:Dschwen
#!/usr/bin/python

import sys, os
print os.environ['HOME']
sys.path.append(os.environ['HOME'] + '/dschwen_bot/pywikipedia')
sys.path.append(os.environ['HOME'] + '/dschwen_bot/urlgrabber-3.1.0/urlgrabber')

import wikipedia
import MySQLdb
import pyexiv2
import re
import math
import string
import hashlib
import unicodedata
import htmlentitydefs 
import marshal
from PHPUnserialize import *
from datetime import timedelta
from datetime import datetime

import array
import urllib2
import byterange
# Install a process-wide urllib2 opener that includes byterange's
# HTTPRangeHandler, so every later urllib2.urlopen() call goes through it.
# The download loop further down sends 'Range' headers to fetch files in chunks.
range_handler = byterange.HTTPRangeHandler()
opener = urllib2.build_opener(range_handler)
urllib2.install_opener(opener)

# Value sent as the 'User-Agent' header with the file download requests
useragent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'

# Time window for the database query: images uploaded within the last two days.
# MediaWiki stores img_timestamp in UTC (YYYYMMDDHHMMSS), so the window is
# computed from UTC as well. A single reference time is used for both cut-offs
# so that the window is exactly two days wide.
now = datetime.utcnow()

#older than
dt = timedelta(0)
cut1 = now - dt

#but newer than
dt = timedelta(2)
cut2 = now - dt

def unescape_charref(ref) :
        """Convert a numeric character reference to the character it names.

        ref is the complete reference including the leading "&#" and the
        trailing ";", e.g. "&#65;" (decimal) or "&#x41;" / "&#X41;" (hex).

        Returns a one-character unicode string.
        Raises ValueError when the digits are not valid for the detected base
        or the number is not a valid code point.
        """
        # strip the "&#" prefix and the ";" suffix
        name = ref[2:-1]
        base = 10
        # HTML allows both "x" and "X" as the hexadecimal marker
        if name[:1] in ("x", "X") :
                name = name[1:]
                base = 16
        return unichr(int(name, base))
                                          
def replace_entities(match) :
        """Substitution callback for re.sub: decode one matched HTML entity.

        match is a regex match object whose whole match is an entity such as
        "&amp;", "&#65;" or "&#x41;".

        Returns the decoded unicode character. Entities that cannot be decoded
        (unknown name, malformed or out-of-range number) are returned
        unchanged, so one bad entity does not abort the whole substitution.
        """
        ent = match.group()
        if ent[1] == "#":
                try:
                        return unescape_charref(ent)
                except (ValueError, OverflowError):
                        # e.g. "&#abc;" or a number beyond the Unicode range:
                        # leave the text exactly as it was
                        return ent

        # named entity: look up the text between "&" and ";"
        repl = htmlentitydefs.name2codepoint.get(ent[1:-1])
        if repl is None :
                return ent
        return unichr(repl)
                                                                                                    
def unescape(data) :
        """Return data with every "&...;" entity passed through replace_entities."""
        entity_pattern = re.compile(r"&#?[A-Za-z0-9]+?;")
        return entity_pattern.sub(replace_entities, data)



# Patterns for geocoding templates in wikitext:
#   loc1RE - {{Location dec|...}}: captures the first two pipe-separated parameters
#   loc2RE - {{Location dec|...}}: captures the whole parameter string
#   loc3RE - {{Location|...}}:     captures the whole parameter string
# Raw string literals are used so the backslashes reach the regex engine verbatim.
loc1RE = re.compile( r'\{\{[Ll]ocation[ _]dec\|([^\|]+)\|([^\|]+)\|' )
loc2RE = re.compile( r'\{\{[Ll]ocation[ _]dec\|([^\}\{]+)\}\}' )
loc3RE = re.compile( r'\{\{[Ll]ocation\|([^\}\{]+)\}\}' )

# Site handle used below to construct wikipedia.Page objects.
# NOTE(review): getSite() is called without arguments; presumably this yields
# the default site from the pywikipedia user configuration - confirm.
site = wikipedia.getSite()


# Load the persisted taglist (a dict keyed by file name, written with
# marshal.dump further down). A missing, unreadable or corrupt file is not an
# error: the bot then simply starts with an empty taglist.
try:
        f = open( "taglist.gifbot", "rb" )
        try:
                taglist = marshal.load( f )
        finally:
                # release the handle even when the content cannot be unmarshalled
                f.close()
except (IOError, EOFError, ValueError, TypeError):
        # IOError: file missing/unreadable; the others are raised by
        # marshal.load for truncated or invalid data
        taglist = {}

try:
        connection = MySQLdb.connect(host="commonswiki-p.db.ts.wikimedia.org", read_default_file="/home/dschwen/.my.cnf", db="commonswiki_p" )
        cursor = connection.cursor() 
        print "Looking for GIF mimetype images (%s > images > %s)"  % ( cut1.strftime( "%Y%m%d%H%M%S" ), cut2.strftime( "%Y%m%d%H%M%S" ) )
        cursor.execute("""
SELECT img_name, img_width, img_height, img_size
FROM image
WHERE img_major_mime = 'image' AND img_minor_mime = 'gif'
AND img_media_type = 'BITMAP'
AND img_timestamp > %s and img_timestamp < %s
""",  ( cut2.strftime( "%Y%m%d%H%M%S" ), cut1.strftime( "%Y%m%d%H%M%S" ) ) )
        print "fetching results..."

        data = cursor.fetchall() 
        fields = cursor.description
        cursor.close()
        connection.close()

        #
        # get potential images from taglist
        #

        print "processing results..."
        for name, img_width, img_height, img_size in data:
                print name

                pixels = img_width * img_height

                if pixels == 0 :
                        print "0 pixels"
                        continue

                if img_size / pixels < 0.005 and name.lower().find( 'anim' ) == -1 :
                        print "bytes per pixel too low, unlikely candidate"
                        continue

                if taglist.has_key( name) and taglist[ name ] :
                        continue

                decomposed_string = unicodedata.normalize( 'NFD', name.decode('utf-8') )

                #page = wikipedia.Page(site, 'Image:' + decomposed_string.encode('utf-8') )
                page = wikipedia.Page(site, 'Image:' + name.decode('utf-8') )
                text = ""
                if page.exists() :
                        text = page.get(get_redirect=True)
        
                # already contains the Animated GIF category
                if string.find(text, '[[Category:Animated GIF' ) >= 0 :
                        print "Already tagged!"
                        continue

                bs = 8000
                of = -bs - 1
                nof = 0
                isAnimated = False
                reading = True
                
                # 0 header
                # 1 block
                # 2 data
                level = 0
                frames = 0

                #if img_width > 300 :
                #       m = hashlib.md5()
                #       m.update( name )
                #       h = m.hexdigest()
                #       url = "http://upload.wikimedia.org/wikipedia/commons/thumb/%s/%s/%s/120px-%s" % ( h[0], h[0:2], urllib2.quote(name), urllib2.quote(name) )
                #else :
                #       url = "http://commons.wikimedia.org/wiki/Special:Filepath/%s" % urllib2.quote(name)
                url = "http://commons.wikimedia.org/wiki/Special:Filepath/%s" % urllib2.quote(name)

                print "downloading %s ..." % url
                try:
                        while reading :
                                if nof - of >= bs - 10 :
                                        of = nof
                                        headers = { 'User-Agent' : useragent, 'Range' : 'bytes=%d-%d' % ( of, of+bs ) }
                                        req = urllib2.Request( url, '', headers )
                                        f = urllib2.urlopen(req)
                                        print "Downloading"
                                        t = f.read();
                                        a = array.array('B')
                                        a.fromstring(t)
                                        f.close()

                                if level == 0 and of > 0 :
                                        print "Mayday, no header info read!"
                                        break

                                if level == 0 :
                                        # analyze header
                                        if t[:6] != 'GIF89a' and t[:6] != 'GIF87a' :
                                                print "Not a GIF image"
                                                break

                                        # global color table flag
                                        gctf = ( a[10] & 128 ) >> 7
                                        print gctf

                                        # colors
                                        colors = 2 << ( a[10] & 7 )
        
                                        nof = 13 + 3 * gctf * colors
                                        print nof, a[nof]

                                        level = 1
                                        continue

                                if level == 1 :
                                        level = 2

                                        if a[nof-of] == 0x21 :
                                                print "Found Extension block!", a[nof-of+1]
                                                nof += 2
                                                continue

                                        if a[nof-of] == 0x2c :
                                                print "Found Image block!"
                                                frames += 1
                                                if frames > 1 :
                                                        isAnimated = True
                                                        break

                                                lctf = ( a[nof-of+8] & 128 ) >> 7
                                                colors = 2 << ( a[nof-of+8] & 7 )
                                                nof += 11 + 3 * lctf * colors
                                                continue

                                        if a[nof-of] == 0x3b :
                                                break

                                        print "Unknown Block found!", a[nof-of]
                                        break

                                if level == 2 :
                                        # block terminator
                                        if a[nof-of] == 0 :
                                                level = 1
                                        # print "  block size:", a[nof-of]
                                        nof += a[nof-of] + 1


                except Exception, e:
                        print "Exception while downloading:", e
                        continue

                if not isAnimated :
                        print "static GIF"
                        taglist[ name ] = True;
                        continue

                print "YAY! tagging..."
                try:
                        wikipedia.setAction("adding [[Category:Animated GIF files]]")

                        text = page.get(get_redirect=True)
                        text2 = text + "\n" + "[[Category:Animated GIF files]]"
                        page.put(text2)
                        taglist[ name ] = True;

                        file = open( "taglist.gifbot", "wb" )
                        marshal.dump( taglist, file )
                        file.close()
                except Exception, e:
                        print "Exception while tagging:", e


        file = open( "taglist.gifbot", "wb" )
        marshal.dump( taglist, file )
        file.close()

except MySQLdb.OperationalError, message: 
        errorMessage = "Error %d:\n%s" % (message[ 0 ], message[ 1 ] )
#