Viquipèdia:Enllaços incorrectes a pàgines de desambiguació/Scripts
Aparença
#! /usr/bin/perl
use strict;
my %interesting=
('' => {
name => 'article',
filename => 'articles.txt',
cutoff => 1},
'Plantilla' => {
name => 'plantilla',
filename => 'templates.txt',
cutoff => 0,
list => 1});
my $exp_re=qr/\(desambiguació\)$/;
my @templates=split(/\n/,<<__EOT__);
Acrònim
Biografies
Desambiguació
Desambiguació 2
DesambigCurta
__EOT__
foreach my $template (@templates) {
$template =~ s/^([[:alpha:]])/[$1\L$1]/;
}
my $tmpl_re=join('|',sort({$b cmp $a} @templates));
my $dab_re=qr/{{(?i:msg:)?\s*(?i:plantilla\s*:\s*)?($tmpl_re)\s*(?i:\||}})/;
my($ns_re,%ns_canon);
my $want_progress=@ARGV>0 && $ARGV[0] eq '-p';
my $last_progress=-1;
sub pageloop (&)
{
my($handler)=@_;
my($size);
local $/="</page>\x0A";
$size=-s PAGES;
while (defined(my $page=<PAGES>)) {
my($nstitle,$ns,$title);
$page =~ /^\s*<page>/ or last;
($nstitle)=($page =~ m{<title>([^<]+)</title>})
or die "Impossible de trouver le titre de la page";
if ($nstitle =~ /^($ns_re):(.+)$/) {
$ns=$1;
$title=$2;
} else {
$ns='';
$title=$nstitle;
}
$page =~ m{</text>} or next;
substr($page,$-[0])='';
$page =~ /<text xml:space="preserve">/
or die "Impossible de trouver le début du texte pour la page $nstitle";
substr($page,0,$+[0])='';
$handler->($nstitle,$ns,$title,$page);
if ($want_progress) {
my $progress=int(tell(PAGES)/$size*1000);
if ($progress!=$last_progress) {
$last_progress=$progress;
printf STDERR "\r0.%.3u",$progress;
}
}
}
if ($want_progress) {
print STDERR "\r";
}
}
sub mungtarget ($$$ )
{
my(undef,$source,$sub)=@_;
for my $target ($_[0]) {
$target =~ tr/\t\n\r/ /;
$target =~ s/^ +//;
$target =~ s/ +$//;
$target =~ s/ {2,}/ /g;
if ($sub && $target =~ m{^/}) {
$target=$source.$target;
} elsif ($target =~ /^:*($ns_re) *: *(.+)$/i) {
$target=$2;
utf8::decode($target);
$target=ucfirst($target);
utf8::encode($target);
$target=$ns_canon{lc($1)}.":".$target;
} elsif ($target =~ /^:*(.+)$/i) {
$target=$1;
utf8::decode($target);
$target=ucfirst($target);
utf8::encode($target);
} else {
# a malformed link, usually empty brackets
}
}
}
my(%dab,%redir,@circular);
sub pass1 ()
{
print STDERR "Analyse : 1er passage\n";
{
my($siteinfo,@namespaces);
local $/="</siteinfo>\x0A";
$siteinfo=<PAGES>;
@namespaces=
$siteinfo =~ m{<namespace key="-?\d+">([^<]+)</namespace>}g;
$ns_re=join('|',map(quotemeta($_),sort({$b cmp $a} @namespaces)));
foreach my $ns (@namespaces) {
$ns_canon{lc($ns)}=$ns;
}
}
pageloop {
my($nstitle,$ns,$title)=splice(@_,0,3);
for my $text ($_[0]) {
my $sub=$interesting{$ns}->{subpages};
if ($ns eq '' && $text =~ $dab_re) {
$dab{$nstitle}=1;
}
if ($text =~ /^#redirect.*\[\[([^\]\|]+)/i) {
my($target,$back);
$target=$1;
mungtarget($target,$nstitle,$sub);
while ($target ne $nstitle) {
my($newtarget);
$newtarget=$redir{$target};
last unless defined($newtarget);
$target=$newtarget;
}
if ($target eq $nstitle) {
push(@circular,$nstitle);
} else {
$redir{$nstitle}=$target;
}
}
}
};
foreach my $target (keys(%redir)) {
my(@chain);
for (;;) {
my $newtarget=$redir{$target};
last unless defined($newtarget);
push(@chain,$target);
$target=$newtarget;
}
pop(@chain);
foreach my $source (@chain) {
$redir{$source}=$target;
}
}
print STDERR " ".keys(%dab)." pages d'homonymie\n";
print STDERR "\n";
}
my %stats=map {
($_,{});
} keys(%interesting);
my %lists=map {
($_,{});
} grep {
$interesting{$_}->{list};
} keys(%interesting);
sub pass2 ()
{
my(%linked);
print STDERR "Analyse : 2me passage\n";
{
local $/="</siteinfo>\x0A";
<PAGES>;
}
pageloop {
my($nstitle,$ns,$title)=splice(@_,0,3);
for my $text ($_[0]) {
my($stats,$lists,$sub);
$stats=$stats{$ns};
$lists=$lists{$ns};
$sub=$interesting{$ns}->{subpages};
if ($stats) {
my(%seen);
while ($text =~ /\[\[([^\]\|]+)/g) {
my($target,$final);
$target=$1;
mungtarget($target,$nstitle,$sub);
next if $target =~ $exp_re;
$final=$redir{$target};
$final=$target unless defined($final);
if ($dab{$final} && !$seen{$final}++) {
$linked{$final}=1;
$stats->{$final}++;
if ($lists) {
push(@{$lists->{$final}},$nstitle);
}
}
}
}
}
};
print STDERR " ".keys(%linked)." liens vers les pages d'homonymie\n";
foreach my $ns (sort(keys(%stats))) {
print STDERR (" ".keys(%{$stats{$ns}})." dans l'espace de nom ".
$interesting{$ns}->{name}."\n");
}
print STDERR "\n";
}
sub wikilink ($ )
{
my($target)=@_;
if (exists($redir{$target})) {
"[{{SERVER}}{{localurl:$target|redirect=no}} $target]";
} elsif ($target =~ m{/\.{1,2}(?:$|/)}) {
"[{{SERVER}}{{localurl:$target}} $target]";
} elsif ($target =~ m{^/}) {
"[[:$target]]";
} else {
"[[$target]]";
}
}
sub report ()
{
print STDERR "Génération du rapport\n";
foreach my $target (@circular) {
$redir{$target}=$target;
}
while (my($ns,$stats)=each(%stats)) {
my($filename,$cutoff)=@{$interesting{$ns}}{qw(filename cutoff)};
my $lists=$lists{$ns};
my @nstitles=sort {
$stats->{$b}<=>$stats->{$a} || $a cmp $b;
} grep {
$stats->{$_}>=$cutoff;
} keys(%{$stats});
my $total=0;
open(REPORT,'>',$filename)
or die "Impossible de créer $filename: $!";
binmode(REPORT);
print REPORT "\xEF\xBB\xBF";
foreach my $nstitle (@nstitles) {
$total+=$stats->{$nstitle};
}
print REPORT "Nombre total de liens : $total\n";
foreach my $nstitle (@nstitles) {
print REPORT ("# ",wikilink($nstitle),": ",$stats->{$nstitle},
" [[Especial:Whatlinkshere/",$nstitle,"|liens]]\n");
if ($lists) {
foreach my $source (sort(@{$lists->{$nstitle}})) {
print REPORT "#* ",wikilink($source),"\n";
}
}
}
close(REPORT);
print STDERR " ".@nstitles." entrées ajoutées à $filename\n";
}
if (@circular) {
@circular=sort(@circular);
open(REPORT,'>','circular.txt')
or die "Impossible de créer circular.txt: $!";
binmode(REPORT);
print REPORT "\xEF\xBB\xBF";
foreach my $target (@circular) {
print REPORT "* ",wikilink($target),"\n";
}
close(REPORT);
print STDERR " ".@circular." entrées ajoutées à circular.txt\n";
} else {
unlink('circular.txt');
}
}
open(PAGES,'<','cawiki-latest-pages-articles.xml')
or die "Impossible d'ouvrir cawiki-latest-pages-articles.xml: $!";
binmode(PAGES);
pass1();
seek(PAGES,0,0);
pass2();
close(PAGES);
report();