#!/usr/bin/perl -w
# Copyright (c) 2010 Ilmari Karonen <vyznev@toolserver.org>.
#
# Permission to use, copy, modify, and/or distribute this
# software for any purpose with or without fee is hereby granted,
# provided that the above copyright notice and this permission
# notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
# THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
use utf8;
use strict;
use LWP::UserAgent;
use HTTP::Request::Common qw'POST $DYNAMIC_FILE_UPLOAD';
$DYNAMIC_FILE_UPLOAD = 1;
use XML::Simple;
use Data::Dumper 'Dumper';
use Getopt::Long 'GetOptions';
use Term::ReadKey 'ReadMode';
use POSIX 'strftime';
use Digest::SHA1;
binmode $_, ":utf8" for \*STDIN, \*STDOUT, \*STDERR;
utf8::decode($_) for @ARGV;
my $unicode = "Üñıç∅∂\x{3F5}";
# Default options:
my $server = "commons.wikimedia.org";
my $prefix = my $suffix = "";
my $filelist;
my $mime_re;
# Usage instructions:
my $usage = <<"USAGE";
Usage: $0 [options] <file(s)>
Options:
-s, --server=<hostname>
Hostname of wiki server (default: $server).
-f, --filelist=<file>
File to read file names from.
-p, --prefix=<text>
Prefix for downloaded file names (default: none).
-u, --suffix=<text>
Suffix for downloaded file names (default: none).
--mime-regexp=<re>
Only download files with types matching this regexp (anchored).
USAGE
# '
# Parse options, print usage message if failed:
GetOptions( 'server|s=s' => \$server,
'filelist|f=s' => \$filelist,
'prefix|p=s' => \$prefix,
'suffix|u=s' => \$suffix,
'mime-regexp|mimeregexp=s' => \$mime_re,
) and ($filelist || @ARGV) or die $usage;
$mime_re = qr/^(?:$mime_re)$/ if $mime_re;
# Read extra file names:
if ($filelist) {
open LIST, '<:utf8', $filelist or die "Error opening $filelist: $!\n";
push @ARGV, <LIST>;
close LIST or die "Error closing $filelist: $!\n";
}
# Set up user agent, define subroutine for API queries:
my $ua = LWP::UserAgent->new(
agent => "Mozilla/4.0 (compatible; $0)",
from => 'vyznev@toolserver.org',
cookie_jar => {},
parse_head => 0,
);
my $apiURI = "http://$server/w/api.php";
sub apireq {
my $query = [format => 'xml', @_];
my $sleep = 5;
ref($_) or utf8::encode($_) for @$query;
while (1) {
my $res = $ua->post($apiURI, $query, Content_Type => 'form-data');
my $err = $res->header('MediaWiki-API-Error') || "";
return XMLin( $res->content ) if $res->is_success and $err ne 'maxlag';
print STDERR "API request failed, ", ($err || $res->status_line), "...";
if ($sleep > 3*60*60) {
warn "giving up\n";
return XMLin( $res->content );
}
warn "sleeping $sleep seconds\n";
sleep $sleep;
$sleep *= 2;
}
}
# Do the uploads:
FILE: foreach my $title (@ARGV) {
s/[\s_]+/_/g, s/^_//, s/_$// for $title;
$title = ucfirst $title;
my $file = $prefix . $title . $suffix;
# warn "Loading info for $title...\n";
my $data = apireq(
maxlag => 5,
action => 'query',
prop => 'imageinfo',
iiprop => 'url|mime|size|sha1',
iilimit => 1,
titles => "File:$title",
redirects => 1,
requestid => $unicode,
);
$data->{requestid} eq $unicode
or die "Unicode round trip failed: expected \"$unicode\", got \"$data->{requestid}\".\n";
exists $data->{query}{pages}{page}{missing}
and warn "Skipping $title, file does not exist.\n" and next;
my $normalized = $data->{query}{pages}{page}{title}
or die "Failed to get normalized title, got:\n", Dumper($data), "\n";
s/^File://i, tr/ /_/ for $normalized;
if ($title ne $normalized) {
warn "$title normalized/redirected to $normalized.\n";
$title = $normalized;
$file = $prefix . $title . $suffix;
}
my $imageinfo = $data->{query}{pages}{page}{imageinfo}{ii}
or warn "Failed to get image info, got:\n", Dumper($data), "\n" and next; # what's causing this?
my $url = $imageinfo->{url}
or die "Failed to get file URL, got:\n", Dumper($data), "\n";
my $size = $imageinfo->{size}
or die "Failed to get file size, got:\n", Dumper($data), "\n";
$mime_re and $imageinfo->{mime} !~ $mime_re
and warn "Skipping $title due to bad MIME type $imageinfo->{mime}.\n" and next;
if (-e $file) {
warn "Skipping $title ($size bytes, $imageinfo->{mime}), $file already exists.\n";
}
else {
print STDERR "Downloading $title ($size bytes, $imageinfo->{mime}) to $file... ";
my $dl = $ua->get($url, ':content_file' => $file);
die "FAILED: " . $dl->status_line . "\n" unless $dl->is_success;
warn $dl->status_line . "\n";
}
-s $file == $size or die "Size mismatch: expect $size bytes, got ".(-s _).".\n";
open FILE, "<", $file or die "Error opening $file: $!\n";
binmode FILE;
my $sha1 = Digest::SHA1->new;
$sha1->addfile(\*FILE);
my $digest = lc $sha1->hexdigest;
$digest eq lc $imageinfo->{sha1} or die "SHA1 mismatch: expected $imageinfo->{sha1}, got $digest.\n";
close FILE or die "Error closing $file: $!\n";
}
__END__