#!/usr/bin/perl
use strict;
use warnings;
use LWP 5.64;
use LWP::Simple;
use utf8;
use Encode;
use Time::HiRes qw (sleep);
# Return a copy of the argument with leading and trailing whitespace removed.
# The caller's value is not modified.
sub trim($)
{
    my ($text) = @_;
    for ($text) {
        s/^\s+//;    # strip leading whitespace
        s/\s+$//;    # strip trailing whitespace (including a final newline)
    }
    return $text;
}
# HTTP client used for every request made by this script.
my $browser = LWP::UserAgent->new();
print "======================\n";

# Start bot.sh from scratch with just the shebang line. The handles live in
# their own bare blocks so they do not leak into the rest of the file.
{
    open( my $bot_fh, '>', 'bot.sh' ) or die "Datei nicht gefunden: $!";
    print {$bot_fh} "#!/bin/bash\n";
    close( $bot_fh ) or die "Could not write bot.sh: $!";
    # Executable for everyone but writable only by the owner: the script is
    # meant to be run later, so it must not be world-writable.
    chmod( 0755, 'bot.sh' ) or warn "Could not chmod bot.sh: $!";
}

# Truncate (or create) the error log so it only holds entries from this run.
{
    open( my $errlog_fh, '>', 'errorlog.txt' ) or die "Errorlog creation failed: $!";
    close( $errlog_fh ) or die "Errorlog creation failed: $!";
}
# Escape a string for embedding inside a double-quoted bash word.
# Backticks are replaced by apostrophes; backslash, double quote and dollar
# sign are backslash-escaped so that bash treats them literally instead of
# ending the quote or performing expansion.
sub _bash_dq_escape {
    my ($text) = @_;
    $text =~ s/`/'/g;
    $text =~ s/([\\"\$])/\\$1/g;
    return $text;
}

# Wiki category inserted whenever a field cannot be extracted from the page.
my $detection_error = "[[Category:GRIN images detection errors]]";

# Main loop: gpn.txt holds one GRIN id per line. For each id the abstract page
# is fetched and parsed, a wiki description is assembled, and one upload
# command is appended to bot.sh.
open( my $gpn_fh, '<', 'gpn.txt' ) or die "Missing the GRIN numbers: $!";
GPN:
while ( my $gpnfile_line = <$gpn_fh> ) {
    my $gpnid = trim($gpnfile_line);
    next GPN if $gpnid eq '';    # ignore blank lines in the id list

    my $abstract_url = "http://grin.hq.nasa.gov/ABSTRACTS/".$gpnid.".html";
    my $remote_file = "http://dayton.hq.nasa.gov/IMAGES/LARGE/".$gpnid.".jpg";

    # Downloading abstract page
    print( "Looking for ".$gpnid." at ".$abstract_url."\n");
    my $abstract_website = $browser->get($abstract_url);
    if ( !$abstract_website->is_success ) {
        open( my $errlog_fh, '>>', 'errorlog.txt' ) or die "Errorlog failed: $!";
        print {$errlog_fh} "Failed to download abstract for ".$gpnid.". Error was: ".$abstract_website->status_line."\n";
        close( $errlog_fh ) or die "Errorlog failed: $!";
        # Nothing to parse: move on instead of dying on the missing title
        # and aborting the whole batch.
        next GPN;
    }

    my $seite_code = $abstract_website->content();
    $seite_code =~ s/<br \/>\r?\n//gi;
    # Re-encode the raw page bytes from Latin-1 to UTF-8 in place.
    Encode::from_to($seite_code, "iso-8859-1", "utf8");

    # Retrieve title
    my $abstract_title;
    if ( $seite_code =~ m/<\!--\ ONE-LINE-DESCRIPTION-BEGIN\ -->([\s\S]*?)<\!--\ ONE-LINE-DESCRIPTION-END\ -->/m ) {
        $abstract_title = trim($1);
    }
    else {
        die "Could not find title";
    }

    # Retrieve description
    my $abstract_full_description = $detection_error;
    if ( $seite_code =~ m/<\!--\ DESCRIPTION-BEGIN\ -->([\s\S]*?)<\!--\ DESCRIPTION-END/m ) {
        # Keep paragraph breaks (blank lines) but drop the single newlines
        # inside each paragraph.
        my @paragraphs = split( /\n\n/, trim($1), -1 );
        tr/\n//d for @paragraphs;
        $abstract_full_description = join( "\n\n", @paragraphs );
        # Collapse runs of spaces to a single space.
        $abstract_full_description =~ s/ {2,}/ /g;
    }

    # Retrieve author
    my $abstract_creator = $detection_error;
    if ( $seite_code =~ m/Creator\/Photographer:<\/B>([\s\S]*?)<LI>/mi ) {
        $abstract_creator = trim($1);
    }

    # Retrieve reference ID info
    my $centerid = $detection_error;
    if ( $seite_code =~ m/<\!--\ OTHERNUMBER-BEGIN\ -->([\s\S]*?)<\!--\ OTHERNUMBER-END\ -->/m ) {
        $centerid = trim($1);
    }
    my $centershort = $detection_error;
    if ( $seite_code =~ m/<\!--\ CENTER-BEGIN\ -->([\s\S]*?)<\!--\ CENTER-END\ -->/m ) {
        $centershort = trim($1);
    }
    # Ids from the MSFC center get an "MSFC-" prefix.
    if ( lc($centershort) eq 'msfc' ) {
        $centerid = "MSFC-".$centerid;
    }

    # Check if it's likely USGov-NASA, put in a cat in case it needs to be checked by humans.
    my $permission = ( $abstract_creator =~ m/^NA[SC]A/i )
        ? "{{PD-USGov-NASA}}"
        : "[[Category:GRIN images requiring copyright evaluation]]";

    # Retrieve the date. The value is only used when it starts with eight
    # digits (read as YYYYMMDD); anything else is flagged as a detection error
    # rather than being sliced blindly into a bogus date template.
    my $abstract_date = $detection_error;
    if ( $seite_code =~ m/DATE-BEGIN\ --([\s\S]*?)--\ DATE-END/m ) {
        my $raw_date = trim($1);
        if ( $raw_date =~ m/^(\d{4})(\d{2})(\d{2})/ ) {
            $abstract_date = "{{date|".$1."|".$2."|".$3."}}";
        }
    }

    # Check for possible dupes. A failed search request is ignored on purpose:
    # the dupe category is only a hint for human reviewers.
    my $searchquery = "http://commons.wikimedia.org/w/api.php?action=query&list=search&srwhat=text&srnamespace=6&format=xml&srsearch=".$gpnid;
    my $search_response = $browser->get( $searchquery );
    my $duperesult = "";
    if ( $search_response->is_success
        && $search_response->content() !~ m/<search\ \/>/i ) {
        # An empty <search /> element means no hits; anything else may be a dupe.
        print "possible DUPE\n";
        $duperesult = "\n[[Category:GRIN possible dupes]]";
    }

    # Assemble the final description for the page
    my $description = join( "\n",
        "{{Information",
        "|Description={{en|1=".$abstract_full_description."}}",
        "|Source=[".$remote_file." Great Images in NASA] [".$abstract_url." Description]",
        "|Date=".$abstract_date,
        "|Author=".$abstract_creator,
        "|Permission=".$permission,
        "|other_versions=",
        "}}",
        "{{NASA-image|id=".$gpnid."|alternateid=".$centerid."|center=".$centershort."}}",
        "",
        "{{subst:unc}}",
        "[[Category:Great Images in NASA]]".$duperesult,
    );

    # Target file name on the wiki.
    my $local_file = $abstract_title." - ".$gpnid.".jpg";

    # Retrieve the image itself (currently disabled; the generated command
    # passes $remote_file to upload.py instead).
    # print( "Going to retrieve file ".$remote_file." and store it as ".$local_file."\n");
    # my $returncode = getstore( $remote_file, $local_file);
    # if( $returncode != 200 )
    # {
    # open( ERRLOG, ">>errorlog.txt" ) || die "Errorlog failed";
    # print ERRLOG "Failed to download ".$gpnid.". Error was: ".$returncode."\n";
    # close( ERRLOG );
    # }

    # Bash normalization of the strings: both end up inside double quotes in
    # bot.sh, and both contain text scraped from the web page.
    $description = _bash_dq_escape($description);
    $local_file = _bash_dq_escape($local_file);

    # Normalizations done by the mediawiki upload
    $local_file =~ s/ /_/g;
    $local_file =~ s/#/-/g;
    $local_file =~ s/:/-/g;
    $local_file =~ s/\//-/g;

    # Write the upload command for this file to bot.sh
    open( my $bot_fh, '>>', 'bot.sh' ) or die "Datei nicht gefunden: $!";
    print {$bot_fh} "python2.5 /home/multichill/pywikipedia/upload.py -keep -noverify -filename:\"".$local_file."\" \"".$remote_file."\" \"".$description."\"\n";
    close( $bot_fh ) or die "Could not write bot.sh: $!";
}
continue {
    # Short pause between ids, also after a skipped one, to limit the
    # request rate.
    sleep( 0.25 );
}
close( $gpn_fh );