Ss2wiki.pl

From PTAGISWiki

Jump to: navigation, search

#!/usr/bin/perl -w
# Sitescape-to-wiki conversion tool.
# Based on the plaintext-to-wiki conversion tool txt2wiki.pl.
# Takes Sitescape export files as input.  Use SEFInterface getEntries to generate the export files.


use HTML::Entities;

use strict;
use warnings;
use POSIX qw(strftime);

# ---------------------------------------------------------------------------
# Main script.
#
# Scans $indir for Sitescape export files (names ending in "txt") and, for
# each one, writes a MediaWiki XML import file with the same base name
# (ending in "xml") into the current working directory.  The page body is
# the export's HTML with presentational markup stripped and the remainder
# entity-encoded so it can sit inside the <text> element.
#
# $title, $id and $timestamp are file-scoped because printheader()
# interpolates them; OUT stays a bareword handle because printheader() and
# printfooter() print to it.
# ---------------------------------------------------------------------------

# Directory holding the Sitescape export files.
my $indir = "/net/reedi/global/ds1/pitweb/ptagis-1.0";

my $title;          # title of the page being converted (read by printheader)
my $id = 9000;      # base page/revision id; incremented once per converted file

# UTC timestamp shared by every generated revision, e.g. 2006-01-31T17:05:09Z
# (the same layout as `date -u +"%FT%TZ"`, without spawning a shell).
my $timestamp = strftime("%Y-%m-%dT%H:%M:%SZ", gmtime());

# Offset the base id by whatever the helper script reports.
# NOTE(review): presumably this is the number of ids already used in the
# wiki -- confirm against getwikiid.pl.
my $count = `/home/rday/bin/getwikiid.pl`;
$count = '' unless defined $count;
chomp $count;
unless ($count =~ /^\s*-?\d+\s*$/) {
        # Keep going with the base id rather than aborting the whole run.
        warn "getwikiid.pl did not return a number; starting ids at $id\n";
        $count = 0;
}
$id += $count;

opendir(my $dh, $indir) or die "couldn't open $indir for reading: $!";

while (defined(my $infile = readdir($dh))) {
        next if ($infile =~ /^\./);         # ".", ".." and hidden files
        next unless ($infile =~ /txt$/);    # only export files are converted
        print "found $infile...\n";

        # Output name: same as the input with the trailing "txt" swapped for
        # "xml".  Written relative to the current working directory.
        (my $outfile = $infile) =~ s/txt$/xml/;
        print "switch to $outfile\n";

        # Page title is the file name without its ".txt" extension.
        ($title = $infile) =~ s/\.txt$//;

        open(my $in, '<', "$indir/$infile") or die "couldn't open $infile for read: $!";
        open(OUT, '>', $outfile) or die "couldn't open $outfile for write: $!";
        printheader();

        # Slurp the whole export in one go: several of the patterns below
        # (head/style/comment removal) have to match across line breaks.
        my $text = do { local $/; <$in> };
        close($in);
        $text = '' unless defined $text;    # empty file: emit header/footer only

        # Count the lines of this file only, before any markup is stripped.
        my $lines = ($text =~ tr/\n//);
        $lines++ if length($text) && $text !~ /\n\z/;   # unterminated last line

        for ($text) {
                #($title) = m/<eTitle>(.*?)<\/eTitle>/ unless ($title);
                s/<\/?eTitle>/==/g;             # export title becomes a wiki heading
                s/<\/?eAbstract[^>]*?>//gi;
                s/<\/?div[^>]*?>//gi;
                s/<hr[^>]*?>/<hr\/>/gi;         # normalise to self-closing form
                s/<\/?meta[^>]*?>//gi;
                s/<\/?st1[^>]*?>//gi;
                s/<\/?font[^>]*?>//gi;
                s/<\/?span[^>]*?>//gi;
                s/<\/?html[^>]*?>//gi;
                s/<\/?body[^>]*?>//gi;
                s/<\/?img[^>]*?>//gims;
                s/<\/?o:p[^>]*?>//gi;
                s/<br>/<br\/>/gi;
                s/<p[^>]*?>/<p>/gi;             # drop attributes from <p>
                s/<\/p>/<\/p>/gi;               # lower-case the closing tag
                s/<head>.*?<\/head>//igms;
                s/<style>.*?<\/style>//igms;
                s/<!--.*?-->//gms;
                s/<!doctype[^>]*>//gi;
                s/<\?xml[^>]*>//gims;
                s/<li[^>]*?>/<li\/>/gims;
                s/<\/li>//gi;
                s/<ul[^>]*?>/<ul>/gims;         # drop attributes from <ul>
                s/<\/ul>/<\/ul>/gi;             # lower-case the closing tag
                # NOTE(review): the published copy of this script had `s/ //g`
                # here, which deletes every space in the page.  Presumably the
                # pattern was the &nbsp; entity and got decoded when the script
                # was posted -- confirm.  Non-breaking-space entities are turned
                # into ordinary spaces so adjacent words are not run together.
                s/&nbsp;/ /gi;
        }
        print OUT encode_entities($text);

        print "scanned $lines lines\n";
        printfooter();
        # Buffered write errors (disk full, etc.) only surface at close time.
        close(OUT) or die "couldn't close $outfile: $!";
        print "converted $outfile...\n";
        $id++;
}
closedir($dh);




# Write the opening part of a MediaWiki XML import document to the OUT
# filehandle: the <mediawiki> root, the <siteinfo> block, and the start of a
# single <page>/<revision> up to and including the opening <text> tag and the
# category link.
#
# Takes no arguments.  Reads the file-level variables $title, $id (used for
# both the page id and the revision id) and $timestamp, and expects OUT to be
# open for writing.  Returns the result of print.
sub printheader {
        # Escape only the XML-special characters so a title such as "Q&A"
        # cannot produce malformed XML; work on a copy so $title is untouched.
        my $safe_title = encode_entities($title, '<>&');
        print OUT <<EOF;
<mediawiki xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.3/
http://www.mediawiki.org/xml/export-0.3.xsd" version="0.3" xml:lang="en">
<siteinfo>
  <sitename>PTAGISWiki</sitename>
  <base>http://php.ptagis.org/wiki/index.php/Main_Page</base>
  <generator>MediaWiki 1.5.6</generator>
  <case>first-letter</case>
  <namespaces>
    <namespace key="-2">Media</namespace>
    <namespace key="-1">Special</namespace>
    <namespace key="0"></namespace>
    <namespace key="1">Talk</namespace>
    <namespace key="2">User</namespace>
    <namespace key="3">User talk</namespace>
    <namespace key="4">Wiki</namespace>
    <namespace key="5">Wiki talk</namespace>
    <namespace key="6">Image</namespace>
    <namespace key="7">Image talk</namespace>
    <namespace key="8">MediaWiki</namespace>
    <namespace key="9">MediaWiki talk</namespace>
    <namespace key="10">Template</namespace>
    <namespace key="11">Template talk</namespace>
    <namespace key="12">Help</namespace>
    <namespace key="13">Help talk</namespace>
    <namespace key="14">Category</namespace>
    <namespace key="15">Category talk</namespace>
  </namespaces>
</siteinfo>
<page>
  <title>$safe_title</title>
  <id>$id</id>
    <revision>
      <id>$id</id>
      <timestamp>$timestamp</timestamp>
      <contributor><username>Root</username><id>1</id></contributor>
      <text space="preserve">
                [[Category:Sites]]
EOF
}

# Finish the document started by printheader(): write the closing </text>,
# </revision>, </page> and </mediawiki> tags, one per line, to the OUT
# filehandle.  Takes no arguments; returns the result of print.
sub printfooter {
        my @closing_tags = ('</text>', '</revision>', '</page>', '</mediawiki>');
        print OUT map { "$_\n" } @closing_tags;
}

Personal tools