User:FairuseBot/10c-removal.pl
Appearance
#!/usr/bin/perl
# 10c-removal
#
# A bot to remove NFCC #10c-incompliant images from pages
#
# Outline of a run:
#   1. Log in and make sure no earlier copy of this script is still running.
#   2. Collect image names from the partial_failures*txt log files in
#      $homedir whose date stamp is more than five days old.
#   3. For each image, fetch its description page data, run a series of
#      sanity checks, remove the NFCC-failure tag, and then run a series of
#      progressively more lenient compliance tests to find the pages on
#      which the image is used without a matching link on its description
#      page.
#   4. If the image is compliant on some pages but not others, remove it
#      from the non-compliant pages.
#   5. Delete the processed log files (only if the run was not aborted) and
#      the PID file.
#
# Helper functions such as GetPageList, GetPageLinks, MakeWikiRegex,
# UpdateLink, RemoveImageFromPage, wikilog, notelog and botwarnlog are not
# defined in this file; presumably they are provided by libBot.

use strict;
use warnings;

use Date::Calc;
use Data::Dumper;

use libBot;

# Links that appear on many image description pages (e.g. from licence
# templates) and therefore say nothing about which article a rationale
# is written for.
my @common_links = ("Copyright", "Copyright infringement", "Fair use", "Logo", "Trademark",
                    "United States copyright law", "Wikimedia", "Computer game", "Counterfeit",
                    "Currency", "Free software", "Portable Network Graphics", "Poster",
                    "Public domain", "Screenshot", "Station identification", "United States Code",
                    "U.S. state", "Video game", "Wikimedia Foundation",
                    "Work of the United States Government");
# Titles are escaped so that characters such as "." only match themselves.
my $common_links = join "|", map { quotemeta } @common_links;

my $test = 0;
my $homedir = '/home/mark/Desktop/wikibots/10cbot';
my $permit_interruptions = 1;    # Allow talkpage messages to stop the bot?

Pearle::init("FairuseBot", "", "$homedir/removebot.log","$homedir/removebot-cookies.txt");
Pearle::config(nullOK => 1, printlevel => 4);
config(username => "FairuseBot");

if(!Pearle::login())
{
    exit;
}

# Check for a running copy
if(-e "$homedir/pid")
{
    # Possible other copy.  Compare PIDs
    my $pid;
    if(open my $pid_in, "<", "$homedir/pid")
    {
        $pid = <$pid_in>;
        close $pid_in;
    }
    else
    {
        warn "Unable to read $homedir/pid: $!\n";
    }

    # The file contents end up in a shell command, so only accept a plain number.
    if(defined($pid) and $pid =~ /^\s*(\d+)\s*$/)
    {
        my $psresult = `ps -p $1`;
        if(defined($psresult) and $psresult =~ /10c-removal\.pl/)
        {
            botwarnlog("*Previous run is taking longer than normal\n");
            exit;
        }
    }
}

if(open my $pid_out, ">", "$homedir/pid")
{
    print $pid_out $$;
    close $pid_out;
}
else
{
    warn "Unable to write $homedir/pid: $!\n";
}

my $total_images = 0;
my @logs;
my $run_completed = 1;    # Cleared when the image loop is abandoned early

{
    my @images;
    my $images_removed = 0;

    Pearle::myLog(2, "Beginning set at " . time() . "\n");

    # Get the log
    if($test)
    {
        @images = ('Image:Dummy316.png');
    }
    else
    {
        my $CURRENT_DIR;
        my @files;

        # Scan the directory for log files
        if(!opendir($CURRENT_DIR, $homedir))
        {
            print "Failed: $!\n";
            exit 1;
        }
        if(!(@files = readdir $CURRENT_DIR))
        {
            print "Failed: $!\n";
            closedir $CURRENT_DIR;
            exit 1;
        }
        closedir $CURRENT_DIR;

        @files = grep {/^partial_failures.*txt$/} @files;
        foreach my $file (@files)
        {
            my ($year, $month, $day) = $file =~ /_(\d{4})-(\d{1,2})-(\d{1,2})/;

            # Delta_Days dies on missing or impossible dates, so screen them out first.
            if(!defined($year) or !Date::Calc::check_date($year, $month, $day))
            {
                Pearle::myLog(1, "*Skipping log file $file: no valid date in its name\n");
                next;
            }

            # Only logs more than five days old are processed.
            if(Date::Calc::Delta_Days($year, $month, $day, (Date::Calc::Today(1))) > 5)
            {
                my $log_fh;
                if(!open($log_fh, "<:utf8", "$homedir/$file"))
                {
                    # Do not queue an unread log for deletion.
                    Pearle::myLog(1, "*Unable to open log file $file: $!\n");
                    next;
                }
                my @new_images = <$log_fh>;
                close $log_fh;
                chomp @new_images;
                push @images, @new_images;
                push @logs, "$homedir/$file";
            }
        }
    }

    Pearle::myLog(3, join("\n", @images));
    Pearle::myLog(3, "\n" . scalar(@images) . " images found\n");
    if(scalar(@images) == 0)
    {
        Pearle::myLog(1, "*No images to remove\n");
    }

    IMAGE:
    foreach my $image (@images)
    {
        my $image_regex = $image;
        my $removal_prefix = "Image with inadequate rationale removed:";
        my $removal_comment = "Removing image with inadequate [[WP:NFCC|rationale]]";

        # Fetch image info
        Pearle::myLog(2, "Processing image $image\n");

        # Fetch the image data.  Test runs also look at namespace 2.
        my $namespaces = $test ? [0, 2] : [0];
        my $image_data = Pearle::APIQuery(titles => [$image],
                                          prop => ['links', 'revisions', 'imageinfo', 'categories'],
                                          plnamespace => $namespaces,                                   # Links
                                          rvprop => ['content'],                                        # Article body
                                          iiprop => ['user', 'comment', 'sha1'], iilimit => 500,        # Upload history
                                          meta => 'userinfo', uiprop => ['hasmsg'],                     # Check for talkpage messages
                                          list => 'imageusage', iutitle => $image,
                                          iunamespace => $namespaces, iulimit => 500);                  # Image usage

        if(!defined($image_data))
        {
            Pearle::myLog(0, "Server did not return an appropriate response. Exiting.\n");
            $run_completed = 0;
            last IMAGE;
        }

        # Extract the list of pages where it's used.
        my @pages = GetPageList($image_data);
        my $num_pages = scalar(@pages);
        my @failed_pages;

        # Extract the categories
        my @categories = GetPageCategories($image_data);

        # Extract a list of pages this image links to.
        my @links = GetPageLinks($image_data);

        # Filter out common links
        @links = grep {$_ !~ /^($common_links)$/} @links;

        if($permit_interruptions and DoIHaveMessages($image_data))
        {
            Pearle::myLog(0, "Talkpage message found; exiting on image $image.\n");
            exit;
        }

        # Sanity check: Does the image still exist?
        if($image_data =~ /missing=""/)
        {
            Pearle::myLog(2, "*Image [[:$image]] has been deleted.\n");
            next;
        }

        # Sanity check: Is this still tagged as non-free?
        if(!grep {$_ eq 'Category:All non-free media'} @categories)
        {
            Pearle::myLog(2, "*Image [[:$image]] is no longer marked as non-free.\n");
            next;
        }

        # Sanity check: Is the image used?
        if(scalar(@pages) == 0)
        {
            # Orphaned fairuse image
            Pearle::myLog(2, "*Image [[:$image]] is not used anywhere\n");

            # Is this image already disputed?
            if(grep {$_ eq 'Category:All disputed non-free images'} @categories)
            {
                Pearle::myLog(2, "*Image [[:$image]] is already marked for deletion.\n");
            }
            else
            {
                if(!grep {$_ eq 'Category:All orphaned fairuse images'} @categories)
                {
                    my $text = "\n{{subst:orfud}}\n";
                    wikilog($image, $text, "Non-free image is not used in any article\n");
                }
            }
            next;
        }

        # Sanity check: Is the image still tagged as disputed?
        if(!grep {$_ eq 'Category:All disputed non-free images'} @categories)
        {
            Pearle::myLog(2, "*Image [[:$image]] is not marked for deletion.\n");
            next;
        }

        # Remove the NFCC-failure tag and the list of pages
        # Blindly removing the tag is safe:
        # 1) If the program fails, 10cbot will pick the image up on its next pass
        # 2) If the image is orphaned, or will be orphaned by removal (unlikely), 10cbot or another bot will pick it up
        # 3) If the image is non-compliant on all pages, 10cbot will pick it up on the next pass
        my $wikipage = Pearle::getPage($image);
        my $text = $wikipage->getEditableText();
        # NOTE(review): \x03 and \x04 are presumably markers that getEditableText()
        # puts around templates - confirm against libBot before touching this pattern.
        $text =~ s/\x03\x44i-missing article links[^\x04]*\x04//s;
        Pearle::myLog(4, "Text after processing:\n$text\n");
        $wikipage->setEditableText($text);
        Pearle::postPage($wikipage, "Removing tag", 0);
        Pearle::limit();

        # Build the image-matching regex
        my ($raw_image) = $image =~ /Image:(.*)/;
        if(!defined($raw_image))
        {
            # No "Image:" prefix at all; nothing sensible can be built from this name.
            botwarnlog("*Parse error on image [[:$image]] (no Image: prefix)\n");
            next;
        }
        $raw_image = MakeWikiRegex($raw_image);

        if($image !~ /(\.jpg|\.jpeg|\.png|\.gif|\.svg)$/i)
        {
            $image_regex = "[ _]*(?:[Ii][Mm][Aa][Gg][Ee]|[Mm][Ee][Dd][Ii][Aa])[ _]*:[ _]*${raw_image}[ _]*";
            Pearle::myLog(2, "*Non-image media file [[:$image]] found.\n");
            next;    # Non-image media are too hard to work with
        }
        else
        {
            $image_regex = "[ _]*[Ii][Mm][Aa][Gg][Ee][ _]*:[ _]*${raw_image}[ _]*";
        }

        # Sanity check: the generated regex must match the name it was built from
        if(!defined($raw_image) or $image !~ /$raw_image/)
        {
            botwarnlog("*Parse error on image [[:$image]] ($raw_image)\n");
            next;
        }
        Pearle::myLog(3, "Image regex: $image_regex\n");

        # Check for best-case compliance: each use has a matching direct link in the body of the text - tested
        Pearle::myLog(4, "Image is used in " . scalar(@pages) . " pages.\n");
        Pearle::myLog(4, "Image is used on " . join("|", @pages) . "\n");
        Pearle::myLog(4, "Image links to " . join("|", @links) . "\n");
        foreach my $page (@links)    # Filter out pages that match a link
        {
            @pages = grep {$_ ne $page} @pages;
        }
        Pearle::myLog(4, "Image failed best-case test for " . scalar(@pages) . " pages.\n");
        next if(scalar(@pages) == 0);

        # Check for liberal compliance:
        # For each use, remove it from the list if there's a case-insensitive match in the body text - tested
        foreach my $page (@pages)
        {
            my $page_match_regex = MakeWikiRegex($page);
            push @failed_pages, $page unless($text =~ /$page_match_regex/i);
        }
        @pages = @failed_pages;
        @failed_pages = ();
        Pearle::myLog(4, "Image failed text test for " . scalar(@pages) . " pages.\n");
        next if(scalar(@pages) == 0);

        # Check for strict compliance:
        # For each link, chase redirects - tested
        if(scalar(@links) > 0)
        {
            my $page_data = Pearle::APIQuery(titles => \@links, redirects => 1);
            if(!defined($page_data))
            {
                # Without the redirect data the image could be removed wrongly; stop instead.
                Pearle::myLog(0, "Server did not return an appropriate response. Exiting.\n");
                $run_completed = 0;
                last IMAGE;
            }
            my $parsed_xml = Pearle::getXMLParser()->XMLin($page_data);
            my @redirects;
            Pearle::myLog(4, Dumper($parsed_xml));

            # The parser yields an array ref for several redirects and a single
            # hash ref for one; normalise both shapes to a list.
            if(exists($parsed_xml->{query}->{redirects}->{r}) and defined($parsed_xml->{query}->{redirects}->{r}))
            {
                if(ref($parsed_xml->{query}->{redirects}->{r}) eq 'ARRAY')
                {
                    @redirects = @{$parsed_xml->{query}->{redirects}->{r}};
                }
                else
                {
                    @redirects = ($parsed_xml->{query}->{redirects}->{r});
                }
            }

            foreach my $page (@pages)
            {
                my $matched = 0;
                foreach my $redirect (@redirects)
                {
                    if($redirect->{to} eq $page)
                    {
                        # We can get there by a redirect
                        UpdateLink($image, $redirect->{from}, $page);
                        Pearle::limit();
                        $matched = 1;
                        last;
                    }
                }
                if(!$matched)
                {
                    push @failed_pages, $page;
                }
            }
            @pages = @failed_pages;
            @failed_pages = ();
        }
        Pearle::myLog(4, "Image failed redirect test for " . scalar(@pages) . " pages.\n");
        next if(scalar(@pages) == 0);

        # Check for near-compliance:
        # For each use, if we can get to it by means of a disambiguation page, update the link - tested
        foreach my $page (@links)
        {
            # Fetch the page text and page links
            # NOTE(review): plnamespace is [2] here while the main query uses [0]
            # outside test mode - possibly a testing leftover; confirm the intent.
            my $page_data = Pearle::APIQuery(titles => [$page],
                                             prop => ['links', 'revisions'],
                                             plnamespace => [2],          # Links
                                             rvprop => ['content']);      # Article body
            if(!defined($page_data))
            {
                Pearle::myLog(0, "Server did not return an appropriate response. Exiting.\n");
                $run_completed = 0;
                last IMAGE;
            }

            # If the page text indicates disambig, see if any of the links is one we're looking for
            my $page_text = GetPageText($page_data);
            if($page_text =~ /\{\{disambig\}\}/i)
            {
                my @page_links = GetPageLinks($page_data);
                foreach my $disambig_link (@page_links)
                {
                    if(grep {$_ eq $disambig_link} @pages)
                    {
                        # It's a match.  Remove it from the list
                        @pages = grep {$_ ne $disambig_link} @pages;

                        # Post to the page
                        my $success = UpdateLink($image, $page, $disambig_link);
                        if(!$success)
                        {
                            botwarnlog("*Failed to update disambiguation link for [[:$image]] from [[$page]] to [[$disambig_link]]\n");
                        }
                        Pearle::limit();
                    }
                }
            }
        }
        Pearle::myLog(4, "Image failed disambiguation test for " . scalar(@pages) . " pages.\n");
        next if(scalar(@pages) == 0);

        # Test for compliance
        # Over-use (some compliant, some non-compliant): Remove from any non-compliant articles, OrphanBot-style.  Leave a note on the article talk page.
        if(scalar(@pages) > 0 and $num_pages > scalar(@pages))
        {
            Pearle::myLog(2, "Image $image failed on " . scalar(@pages) . " pages.\n");
            my $parsed_removal_comment = $removal_comment;
            $parsed_removal_comment =~ s/image/[[:$image|image]]/;
            foreach my $page (@pages)
            {
                my $hits = 0;
                notelog("Page for removal: $page\n");
                if($hits = RemoveImageFromPage($image, $page, $image_regex, $removal_prefix, $parsed_removal_comment))
                {
                    # Don't limit if we just touched the article
                    Pearle::myLog(2, "Removed image $image from article $page ($hits times)\n");
                    Pearle::limit();
                }
                $images_removed += $hits;
            }
        }
        elsif(scalar(@pages) > 0)
        {
            # Fully-non-compliant.  Should never occur, but if it does, let 10cbot pick it up on the next pass.
            Pearle::myLog(2, "Image $image failed on all pages\n");
        }
        else
        {
            Pearle::myLog(2, "Image $image is now fully-compliant\n");
        }
    }

    Pearle::myLog(2, "Finished with set. Removed $images_removed images.\n");
    $total_images += $images_removed;
}

# Remove the processed logs, but only when every image in them was handled;
# after an aborted run they are kept so the remaining images are retried.
if($run_completed)
{
    unlink @logs;
}
else
{
    Pearle::myLog(1, "*Run aborted early; keeping log files for the next run\n");
}
unlink "$homedir/pid"