#!/usr/bin/python
import sys, os
print os.environ['HOME']
sys.path.append(os.environ['HOME'] + '/dschwen_bot/pywikipedia')
sys.path.append(os.environ['HOME'] + '/dschwen_bot/urlgrabber-3.1.0/urlgrabber')
import wikipedia
import MySQLdb
import pyexiv2
import re
import math
import string
import hashlib
import unicodedata
import htmlentitydefs
import marshal
from PHPUnserialize import *
from datetime import timedelta
from datetime import datetime
import array
import urllib2
import byterange
# Browser-like User-Agent string sent with every download request below.
useragent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'

# Install a urllib2 opener that understands HTTP Range responses, so the
# GIF scanner can fetch file headers piecewise instead of whole files.
range_handler = byterange.HTTPRangeHandler()
opener = urllib2.build_opener(range_handler)
urllib2.install_opener(opener)

# Upload-time window: consider images newer than cut2 but older than cut1.
dt = timedelta(0)
cut1 = datetime.now() - dt    # upper bound: right now
dt = timedelta(2)
cut2 = datetime.now() - dt    # lower bound: two days ago
def unescape_charref(ref):
    """Decode a numeric HTML character reference to its character.

    ref is the full reference including the leading '&#' and trailing ';',
    e.g. '&#65;' or '&#x41;'.  Returns the referenced character, or ref
    unchanged when the digits are not valid in the detected base --
    mirroring the unknown-named-entity fallback in replace_entities.
    """
    name = ref[2:-1]
    base = 10
    # HTML permits both '&#x..;' and '&#X..;' for hexadecimal references.
    if name.startswith("x") or name.startswith("X"):
        name = name[1:]
        base = 16
    try:
        return unichr(int(name, base))
    except ValueError:
        # Malformed digits (or code point out of range) -- leave text as-is
        # instead of blowing up the surrounding re.sub call.
        return ref
def replace_entities(match):
    """re.sub callback: turn one matched HTML entity into its character.

    Numeric references ('&#...;') are delegated to unescape_charref;
    named entities are resolved via htmlentitydefs.  Unknown names are
    returned verbatim.
    """
    ent = match.group()
    # Numeric character reference?
    if ent[1] == "#":
        return unescape_charref(ent)
    # Named entity: strip the '&' and ';' and look up its code point.
    codepoint = htmlentitydefs.name2codepoint.get(ent[1:-1])
    if codepoint is None:
        return ent
    return unichr(codepoint)
def unescape(data):
    """Return data with every HTML entity / numeric character reference
    replaced by the character it denotes."""
    entity_pattern = r"&#?[A-Za-z0-9]+?;"
    return re.sub(entity_pattern, replace_entities, data)
# Geocoding templates that may appear on a file description page:
#   loc1RE: {{Location dec|lat|lon|...}} -- captures the first two arguments
#   loc2RE: {{Location dec|args}}        -- captures the whole argument list
#   loc3RE: {{Location|args}}            -- captures the whole argument list
# Raw strings so the regex backslash escapes are explicit rather than
# relying on Python passing unknown string escapes through unchanged.
loc1RE = re.compile( r'\{\{[Ll]ocation[ _]dec\|([^\|]+)\|([^\|]+)\|' )
loc2RE = re.compile( r'\{\{[Ll]ocation[ _]dec\|([^\}\{]+)\}\}' )
loc3RE = re.compile( r'\{\{[Ll]ocation\|([^\}\{]+)\}\}' )
# Wiki to operate on, taken from the user's pywikipedia configuration
# (presumably Commons, given the commonswiki database queried below).
site = wikipedia.getSite()
# Load the persistent name -> handled flag cache of already-processed
# images; a missing or corrupt cache file just starts us with an empty map.
try:
    f = open( "taglist.gifbot", "rb" )
    try:
        taglist = marshal.load( f )
    finally:
        # close the handle even when the marshal data is unreadable
        f.close()
except (IOError, EOFError, ValueError, TypeError):
    # IOError: file absent/unreadable; the others: corrupt marshal payload.
    # (Narrowed from a bare except, which also swallowed KeyboardInterrupt.)
    taglist = {}
# Main pass: query the Commons DB replica for recent GIF uploads, probe each
# file's structure over HTTP Range requests, and tag animated ones.  A MySQL
# outage is caught by the OperationalError handler at the bottom.
try:
    # Toolserver Commons replica; credentials come from the user's .my.cnf.
    connection = MySQLdb.connect(host="commonswiki-p.db.ts.wikimedia.org", read_default_file="/home/dschwen/.my.cnf", db="commonswiki_p" )
    cursor = connection.cursor()
    print "Looking for GIF mimetype images (%s > images > %s)" % ( cut1.strftime( "%Y%m%d%H%M%S" ), cut2.strftime( "%Y%m%d%H%M%S" ) )
    # All GIF bitmaps uploaded inside the (cut2, cut1) window.  MediaWiki
    # stores timestamps as YYYYMMDDHHMMSS strings, so comparison works.
    cursor.execute("""
SELECT img_name, img_width, img_height, img_size
FROM image
WHERE img_major_mime = 'image' AND img_minor_mime = 'gif'
AND img_media_type = 'BITMAP'
AND img_timestamp > %s and img_timestamp < %s
""", ( cut2.strftime( "%Y%m%d%H%M%S" ), cut1.strftime( "%Y%m%d%H%M%S" ) ) )
    print "fetching results..."
    data = cursor.fetchall()
    fields = cursor.description  # NOTE(review): never used below
    cursor.close()
    connection.close()
    #
    # get potential images from taglist
    #
    print "processing results..."
    for name, img_width, img_height, img_size in data:
        print name
        # --- cheap pre-filters before downloading anything ---
        pixels = img_width * img_height
        if pixels == 0 :
            print "0 pixels"
            continue
        # Animated GIFs need more bytes per pixel than static ones.
        # NOTE(review): Python 2 integer division -- the ratio floors to 0
        # whenever img_size < pixels, so this skips most such files unless
        # the name contains 'anim'.  Probably float(img_size)/pixels was
        # intended; confirm before changing.
        if img_size / pixels < 0.005 and name.lower().find( 'anim' ) == -1 :
            print "bytes per pixel too low, unlikely candidate"
            continue
        # Skip images already handled in a previous run.
        if taglist.has_key( name) and taglist[ name ] :
            continue
        decomposed_string = unicodedata.normalize( 'NFD', name.decode('utf-8') )  # NOTE(review): unused leftover of the commented Page() call below
        #page = wikipedia.Page(site, 'Image:' + decomposed_string.encode('utf-8') )
        page = wikipedia.Page(site, 'Image:' + name.decode('utf-8') )
        text = ""
        if page.exists() :
            text = page.get(get_redirect=True)
        # already contains the Animated GIF category
        if string.find(text, '[[Category:Animated GIF' ) >= 0 :
            print "Already tagged!"
            continue
        # --- incremental GIF parse over HTTP Range requests ---
        bs = 8000            # bytes fetched per Range request
        of = -bs - 1         # absolute offset of the current chunk (forces an initial fetch)
        nof = 0              # absolute offset of the next byte to inspect
        isAnimated = False
        reading = True       # NOTE(review): never set False; loop exits only via break
        # parser state:
        # 0 header
        # 1 block
        # 2 data
        level = 0
        frames = 0           # image descriptors seen; >1 means animated
        #if img_width > 300 :
        # m = hashlib.md5()
        # m.update( name )
        # h = m.hexdigest()
        # url = "http://upload.wikimedia.org/wikipedia/commons/thumb/%s/%s/%s/120px-%s" % ( h[0], h[0:2], urllib2.quote(name), urllib2.quote(name) )
        #else :
        # url = "http://commons.wikimedia.org/wiki/Special:Filepath/%s" % urllib2.quote(name)
        url = "http://commons.wikimedia.org/wiki/Special:Filepath/%s" % urllib2.quote(name)
        print "downloading %s ..." % url
        try:
            while reading :
                # Fetch a fresh chunk when fewer than ~10 bytes of the
                # current one remain unread.
                if nof - of >= bs - 10 :
                    of = nof
                    headers = { 'User-Agent' : useragent, 'Range' : 'bytes=%d-%d' % ( of, of+bs ) }
                    # NOTE(review): the '' body makes this a POST request;
                    # None would issue a plain GET.  Verify Special:Filepath
                    # honours Range on POST before touching this.
                    req = urllib2.Request( url, '', headers )
                    f = urllib2.urlopen(req)
                    print "Downloading"
                    t = f.read();
                    a = array.array('B')   # chunk as unsigned byte values
                    a.fromstring(t)
                    f.close()
                if level == 0 and of > 0 :
                    # A second chunk was needed before the header parsed --
                    # something is wrong; give up on this file.
                    print "Mayday, no header info read!"
                    break
                if level == 0 :
                    # analyze header
                    if t[:6] != 'GIF89a' and t[:6] != 'GIF87a' :
                        print "Not a GIF image"
                        break
                    # global color table flag
                    gctf = ( a[10] & 128 ) >> 7
                    print gctf
                    # colors
                    colors = 2 << ( a[10] & 7 )
                    # skip the 13-byte header plus the global color table
                    nof = 13 + 3 * gctf * colors
                    print nof, a[nof]
                    level = 1
                    continue
                if level == 1 :
                    # At a block introducer; subsequent iterations consume
                    # its length-prefixed sub-blocks at level 2.
                    level = 2
                    if a[nof-of] == 0x21 :
                        # extension block: skip introducer + label bytes
                        print "Found Extension block!", a[nof-of+1]
                        nof += 2
                        continue
                    if a[nof-of] == 0x2c :
                        print "Found Image block!"
                        frames += 1
                        # a second image descriptor proves animation
                        if frames > 1 :
                            isAnimated = True
                            break
                        # local color table flag and size
                        lctf = ( a[nof-of+8] & 128 ) >> 7
                        colors = 2 << ( a[nof-of+8] & 7 )
                        # skip descriptor + LZW code-size byte + local table
                        nof += 11 + 3 * lctf * colors
                        continue
                    if a[nof-of] == 0x3b :
                        # trailer byte: end of GIF stream
                        break
                    print "Unknown Block found!", a[nof-of]
                    break
                if level == 2 :
                    # block terminator
                    if a[nof-of] == 0 :
                        level = 1
                    # print " block size:", a[nof-of]
                    # advance past this sub-block (length byte + payload)
                    nof += a[nof-of] + 1
        except Exception, e:
            # best-effort: a failed download just skips this image
            print "Exception while downloading:", e
            continue
        if not isAnimated :
            print "static GIF"
            # remember the verdict so the file is never fetched again
            taglist[ name ] = True;
            continue
        print "YAY! tagging..."
        try:
            wikipedia.setAction("adding [[Category:Animated GIF files]]")
            text = page.get(get_redirect=True)
            text2 = text + "\n" + "[[Category:Animated GIF files]]"
            page.put(text2)
            taglist[ name ] = True;
            # checkpoint the cache after every successful tag
            file = open( "taglist.gifbot", "wb" )
            marshal.dump( taglist, file )
            file.close()
        except Exception, e:
            print "Exception while tagging:", e
    # final save of the processed-image cache
    file = open( "taglist.gifbot", "wb" )
    marshal.dump( taglist, file )
    file.close()
except MySQLdb.OperationalError, message:
    # NOTE(review): errorMessage is built but never printed or logged.
    errorMessage = "Error %d:\n%s" % (message[ 0 ], message[ 1 ] )
#