User:Ilmari Karonen/mwdownload.pl

#!/usr/bin/perl -w

# Copyright (c) 2010 Ilmari Karonen <vyznev@toolserver.org>.
#
# Permission to use, copy, modify, and/or distribute this
# software for any purpose with or without fee is hereby granted,
# provided that the above copyright notice and this permission
# notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
# THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

use utf8;
use strict;
use LWP::UserAgent;
use HTTP::Request::Common qw'POST $DYNAMIC_FILE_UPLOAD';
$DYNAMIC_FILE_UPLOAD = 1;
use XML::Simple;
use Data::Dumper 'Dumper';
use Getopt::Long 'GetOptions';
use Term::ReadKey 'ReadMode';
use POSIX 'strftime';
use Digest::SHA1;

# Treat the three standard streams as UTF-8 text.
foreach my $stream (\*STDIN, \*STDOUT, \*STDERR) {
    binmode $stream, ":utf8";
}

# Command-line arguments arrive as bytes; decode them in place.
foreach my $arg (@ARGV) {
    utf8::decode($arg);
}

# Non-ASCII probe string.  It is sent along with every API query as the
# requestid, and the main loop dies if it does not come back unchanged.
my $unicode = "Üñıç∅∂\x{3F5}";

# Default options:
# (each of these can be overridden from the command line)

my $server = "commons.wikimedia.org";   # wiki host whose api.php is queried
my $prefix = "";                        # prepended to every local file name
my $suffix = "";                        # appended to every local file name
my $filelist;                           # optional file listing extra titles
my $mime_re;                            # optional MIME type filter

# Usage instructions:
#
# This is an interpolating heredoc: $0 and $server are expanded right here,
# before the command line is parsed, so the text always shows the built-in
# default server.  The message is emitted via die() when option parsing
# fails or when neither a file list nor any file argument is given.

my $usage = <<"USAGE";
Usage: $0 [options] <file(s)>
Options:
    -s, --server=<hostname>
        Hostname of wiki server (default: $server).
    -f, --filelist=<file>
        File to read file names from.
    -p, --prefix=<text>
        Prefix for downloaded file names (default: none).
    -u, --suffix=<text>
        Suffix for downloaded file names (default: none).
    --mime-regexp=<re>
        Only download files with types matching this regexp (anchored).
USAGE
# '

# Parse options, print usage message if failed:
# (running with neither --filelist nor any file argument also counts as
# a failure)

my @option_spec = (
    'server|s=s'               => \$server,
    'filelist|f=s'             => \$filelist,
    'prefix|p=s'               => \$prefix,
    'suffix|u=s'               => \$suffix,
    'mime-regexp|mimeregexp=s' => \$mime_re,
);

my $options_ok = GetOptions(@option_spec);
die $usage unless $options_ok and ($filelist || @ARGV);

# Wrap the user's pattern so that it has to match the whole MIME type.
if ($mime_re) {
    $mime_re = qr/^(?:$mime_re)$/;
}

# Read extra file names:
#
# Every non-blank line of the --filelist file is taken as one title and
# appended after the titles given on the command line.  The file is read
# as UTF-8.

if ($filelist) {
    open my $list_fh, '<:utf8', $filelist or die "Error opening $filelist: $!\n";
    while (my $line = <$list_fh>) {
        chomp $line;
        # Skip blank lines: an empty title would otherwise be queried as a
        # bare "File:", which can never name a real file.
        next unless $line =~ /\S/;
        push @ARGV, $line;
    }
    close $list_fh or die "Error closing $filelist: $!\n";
}

# Set up user agent, define subroutine for API queries:

my $ua = LWP::UserAgent->new(
    agent      => "Mozilla/4.0 (compatible; $0)",
    from       => 'vyznev@toolserver.org',
    cookie_jar => {},    # keep cookies between requests
    parse_head => 0,     # do not parse HTML <head> sections of responses
);

# Build the API endpoint.  --server is normally a bare hostname, which is
# reached over plain http exactly as before.  It may also carry an explicit
# scheme (e.g. "https://commons.wikimedia.org"), which is then used as
# given.  That matters for wikis that redirect http to https: API queries
# are sent as POST, and LWP does not follow redirects for POST by default.
my $apiURI = ($server =~ m{^https?://}i ? $server : "http://$server") . "/w/api.php";

# apireq(KEY => VALUE, ...)
#
# POSTs the given parameters (plus format=xml) to the API as a
# multipart/form-data request and returns the response body parsed by XMLin.
# Plain string parameters are UTF-8 encoded first; references are passed
# through untouched.
#
# A request counts as failed when the HTTP response is not a success or the
# MediaWiki-API-Error header says "maxlag".  Failed requests are retried
# after a delay that starts at 5 seconds and doubles each time.  Once the
# delay has grown beyond three hours the sub gives up and returns whatever
# XMLin makes of the last response.
sub apireq {
    my @params = (format => 'xml', @_);
    foreach my $item (@params) {
        utf8::encode($item) unless ref $item;
    }

    my $delay = 5;
    for (;;) {
        my $response  = $ua->post($apiURI, \@params, Content_Type => 'form-data');
        my $api_error = $response->header('MediaWiki-API-Error') || "";

        if ($response->is_success and $api_error ne 'maxlag') {
            return XMLin( $response->content );
        }

        print STDERR "API request failed, ", ($api_error || $response->status_line), "...";
        if ($delay > 3*60*60) {
            warn "giving up\n";
            return XMLin( $response->content );
        }

        warn "sleeping $delay seconds\n";
        sleep $delay;
        $delay *= 2;
    }
}

# Do the downloads:
#
# For each requested title, ask the API for the file's URL, MIME type, size
# and SHA-1 hash, download the file unless the target already exists
# locally, and then check the local file's size and SHA-1 against the values
# the API reported.  Missing files, pages without image info and files
# rejected by the MIME filter are skipped with a warning; every other
# failure is fatal.

FILE: foreach my $title (@ARGV) {
    # Canonicalize the title: collapse runs of whitespace/underscores into a
    # single underscore, trim underscores from both ends, and capitalize the
    # first letter.
    s/[\s_]+/_/g, s/^_//, s/_$// for $title;
    $title = ucfirst $title;

    my $file = $prefix . $title . $suffix;

    # warn "Loading info for $title...\n";

    my $data = apireq(
        maxlag    => 5,
        action    => 'query',
        prop      => 'imageinfo',
        iiprop    => 'url|mime|size|sha1',
        iilimit   => 1,
        titles    => "File:$title",
        redirects => 1,
        requestid => $unicode,
    );

    # The request id is expected to come back unchanged; if it does not,
    # non-ASCII text is being mangled somewhere along the way, so stop.
    $data->{requestid} eq $unicode
        or die "Unicode round trip failed: expected \"$unicode\", got \"$data->{requestid}\".\n";
    exists $data->{query}{pages}{page}{missing}
        and warn "Skipping $title, file does not exist.\n" and next;
    my $normalized = $data->{query}{pages}{page}{title}
        or die "Failed to get normalized title, got:\n", Dumper($data), "\n";

    # Strip the namespace prefix and go back to underscores.  The wiki may
    # report its own localized name for the file namespace, so drop
    # everything up to the first colon instead of only a literal "File:".
    s/^[^:]+://, tr/ /_/ for $normalized;

    if ($title ne $normalized) {
        warn "$title normalized/redirected to $normalized.\n";
        $title = $normalized;
        $file = $prefix . $title . $suffix;
    }

    my $imageinfo = $data->{query}{pages}{page}{imageinfo}{ii}
        or warn "Failed to get image info, got:\n", Dumper($data), "\n" and next;  # what's causing this?
    my $url = $imageinfo->{url}
        or die "Failed to get file URL, got:\n", Dumper($data), "\n";
    my $size = $imageinfo->{size}
        or die "Failed to get file size, got:\n", Dumper($data), "\n";
    $mime_re and $imageinfo->{mime} !~ $mime_re
        and warn "Skipping $title due to bad MIME type $imageinfo->{mime}.\n" and next;

    if (-e $file) {
        # An existing file is not downloaded again, but it is still verified
        # below.
        warn "Skipping $title ($size bytes, $imageinfo->{mime}), $file already exists.\n";
    }
    else {
        print STDERR "Downloading $title ($size bytes, $imageinfo->{mime}) to $file... ";

        my $dl = $ua->get($url, ':content_file' => $file);
        die "FAILED: " . $dl->status_line . "\n" unless $dl->is_success;
        warn $dl->status_line . "\n";
    }

    # Verify the local copy: first the size, then the SHA-1 hash.
    -s $file == $size or die "Size mismatch: expect $size bytes, got ".(-s _).".\n";

    open my $fh, "<", $file or die "Error opening $file: $!\n";
    binmode $fh;
    my $sha1 = Digest::SHA1->new;
    $sha1->addfile($fh);
    # Close before comparing so the handle is released even on a mismatch.
    close $fh or die "Error closing $file: $!\n";
    my $digest = lc $sha1->hexdigest;
    $digest eq lc $imageinfo->{sha1} or die "SHA1 mismatch: expected $imageinfo->{sha1}, got $digest.\n";
}

__END__