User:TheDJ/grin importer.pl

#!/usr/bin/perl
 
use strict;
use warnings;
use LWP 5.64;
use LWP::Simple;
use utf8;
use Encode;
use Time::HiRes qw (sleep);

# Return a copy of the given string with leading and trailing whitespace
# removed. Whitespace inside the string is left untouched.
sub trim($)
{
	my ($text) = @_;
	for ($text) {
		s/^\s+//;    # strip leading whitespace
		s/\s+$//;    # strip trailing whitespace (including a final newline)
	}
	return $text;
}

# HTTP client used for every request made by this script.
my $browser = LWP::UserAgent->new();

print "======================\n";

# Start bot.sh from scratch with only the bash shebang line; the upload
# commands are appended to it later, one per processed image.
{
  open( my $bot_fh, '>', 'bot.sh' ) or die "Datei nicht gefunden: $!";
  print {$bot_fh} "#!/bin/bash\n";
  # Buffered write errors only surface at close, so check it.
  close( $bot_fh ) or die "Could not write bot.sh: $!";
  chmod( 0777, 'bot.sh' ) or warn "Could not make bot.sh executable: $!\n";
}

# Truncate the error log so that it only contains entries from this run.
{
  open( my $errlog_fh, '>', 'errorlog.txt' ) or die "Errorlog creation failed: $!";
  close( $errlog_fh ) or die "Errorlog creation failed: $!";
}

# Main import loop.
#
# Reads one GRIN id per line from gpn.txt. For every id it downloads the
# abstract page, extracts title, description, creator, reference id, center
# and date, checks Wikimedia Commons for possible duplicates, and appends one
# pywikipedia upload command to bot.sh. Download failures are recorded in
# errorlog.txt and the id is skipped.

# Category text used in place of any field that could not be extracted.
my $detection_error = "[[Category:GRIN images detection errors]]";

# Append one line to errorlog.txt. The file is reopened on every call so that
# entries already written are on disk even if the script dies later.
sub _log_error {
  my ($message) = @_;
  open( my $errlog_fh, '>>', 'errorlog.txt' ) or die "Errorlog failed: $!";
  print {$errlog_fh} $message."\n";
  close( $errlog_fh ) or die "Errorlog failed: $!";
  return;
}

# Make a string safe to embed between double quotes on a bash command line.
# Backslash, double quote and dollar are backslash-escaped; backticks are
# replaced by single quotes.
sub _bash_escape {
  my ($text) = @_;
  $text =~ s/([\\"\$])/\\$1/g;
  $text =~ s/`/'/g;
  return $text;
}

open( my $gpn_fh, '<', 'gpn.txt' ) or die "Missing the GRIN numbers: $!";

GPN: while( my $gpnfile_line = <$gpn_fh> ) {
  my $gpnid = trim($gpnfile_line);
  # A blank line would otherwise trigger requests for an empty id.
  next GPN if $gpnid eq '';

  my $abstract_url = "http://grin.hq.nasa.gov/ABSTRACTS/".$gpnid.".html";
  my $remote_file = "http://dayton.hq.nasa.gov/IMAGES/LARGE/".$gpnid.".jpg";

  # Downloading abstract page
  print( "Looking for ".$gpnid." at ".$abstract_url."\n");
  my $abstract_website = $browser->get( $abstract_url );
  if( ! $abstract_website->is_success ) {
    _log_error( "Failed to download abstract for ".$gpnid.". Error was: ".$abstract_website->status_line );
    # Nothing to parse: skip this id instead of dying on the missing title.
    sleep( 0.25 );
    next GPN;
  }

  my $seite_code = $abstract_website->content();
  $seite_code =~ s/<br \/>\r?\n//gi;
  # Convert the page bytes from ISO-8859-1 to UTF-8 in place.
  Encode::from_to($seite_code, "iso-8859-1", "utf8");

  # Retrieve title (mandatory: it becomes part of the target file name)
  my $abstract_title;
  if( $seite_code =~ m/<!-- ONE-LINE-DESCRIPTION-BEGIN -->(.*?)<!-- ONE-LINE-DESCRIPTION-END -->/s ) {
    $abstract_title = trim($1);
  } else {
    die "Could not find title";
  }

  # Retrieve description
  my $abstract_full_description = $detection_error;
  if( $seite_code =~ m/<!-- DESCRIPTION-BEGIN -->(.*?)<!-- DESCRIPTION-END/s ) {
    $abstract_full_description = trim($1);
    # Keep paragraph breaks (pairs of newlines) and drop single newlines,
    # in one left-to-right pass.
    $abstract_full_description =~ s/(\n\n)|\n/defined $1 ? $1 : ''/ge;
    # Collapse every run of spaces to a single space.
    $abstract_full_description =~ s/[ ]{2,}/ /g;
  }

  # Retrieve author
  my $abstract_creator =
    $seite_code =~ m{Creator/Photographer:</B>(.*?)<LI>}si ? trim($1) : $detection_error;

  # Retrieve reference ID info
  my $centerid =
    $seite_code =~ m/<!-- OTHERNUMBER-BEGIN -->(.*?)<!-- OTHERNUMBER-END -->/s ? trim($1) : $detection_error;

  my $centershort =
    $seite_code =~ m/<!-- CENTER-BEGIN -->(.*?)<!-- CENTER-END -->/s ? trim($1) : $detection_error;

  # Reference ids of the MSFC center get an "MSFC-" prefix.
  if( lc($centershort) eq 'msfc' ) {
    $centerid = "MSFC-".$centerid;
  }

  # Check if it's likely USGov-NASA, put in a cat in case it needs to be checked by humans.
  my $permission = $abstract_creator =~ m/^NA[SC]A/i
    ? "{{PD-USGov-NASA}}"
    : "[[Category:GRIN images requiring copyright evaluation]]";

  # Retrieve the date; the first eight characters are split as yyyy, mm, dd.
  my $abstract_date = $detection_error;
  if( $seite_code =~ m/DATE-BEGIN --(.*?)-- DATE-END/s ) {
    my $raw_date = trim($1);
    my $yyyy = substr($raw_date,0,4);
    my $mm = substr($raw_date,4,2);
    my $dd = substr($raw_date,6,2);
    $abstract_date = "{{date|".$yyyy."|".$mm."|".$dd."}}";
  }

  # Check for possible dupes: any search hit for the id on Commons flags the
  # upload for human review. A failed search request is silently ignored.
  my $searchquery = "http://commons.wikimedia.org/w/api.php?action=query&list=search&srwhat=text&srnamespace=6&format=xml&srsearch=".$gpnid;
  my $search_response = $browser->get( $searchquery );
  my $duperesult = "";
  if( $search_response->is_success && $search_response->content() !~ m/<search \/>/i ) {
    print "possible DUPE\n";
    $duperesult = "\n[[Category:GRIN possible dupes]]";
  }

  # Assemble the final description for the page
  my $description = join( "\n",
    "{{Information",
    "|Description={{en|1=".$abstract_full_description."}}",
    "|Source=[".$remote_file." Great Images in NASA] [".$abstract_url." Description]",
    "|Date=".$abstract_date,
    "|Author=".$abstract_creator,
    "|Permission=".$permission,
    "|other_versions=",
    "}}",
    "{{NASA-image|id=".$gpnid."|alternateid=".$centerid."|center=".$centershort."}}",
    "",
    "{{subst:unc}}",
    "[[Category:Great Images in NASA]]".$duperesult,
  );

  # Target file name. The local download below is disabled; the remote URL
  # is passed to upload.py instead.
  my $local_file = $abstract_title." - ".$gpnid.".jpg";
#  print( "Going to retrieve file ".$remote_file." and store it as ".$local_file."\n");
#  my $returncode = getstore( $remote_file, $local_file);
#  if( $returncode != 200 )
#  {
#    open( ERRLOG, ">>errorlog.txt" ) || die "Errorlog failed";
#    print ERRLOG "Failed to download ".$gpnid.". Error was: ".$returncode."\n";
#    close( ERRLOG );
#  }

  # Bash normalization of the strings (both end up inside double quotes)
  $description = _bash_escape( $description );
  $local_file = _bash_escape( $local_file );

  # Normalizations done by the mediawiki upload
  $local_file =~ tr/ /_/;
  $local_file =~ tr{#:/}{-};

  # Write the upload command for this file to bot.sh
  open( my $bot_fh, '>>', 'bot.sh' ) or die "Datei nicht gefunden: $!";
  print {$bot_fh} "python2.5 /home/multichill/pywikipedia/upload.py -keep -noverify -filename:\"".$local_file."\" \"".$remote_file."\" \"".$description."\"\n";
  close( $bot_fh ) or die "Could not write bot.sh: $!";

  # Short pause between ids to avoid hammering the remote servers.
  sleep( 0.25 );
}

close( $gpn_fh );