From PTAGISWiki
#!/usr/bin/perl -w
# Sitescape to wiki conversion tool
# Based on the plaintext to wiki conversion tool txt2wiki.pl
# takes sitescape export files as input. Use SEFInterface getEntries to generate export files
use HTML::Entities;
use strict;

# Directory holding the Sitescape export files (*.txt) to convert.
my $indir = "/net/reedi/global/ds1/pitweb/ptagis-1.0";

# File-scoped state. printheader() interpolates $title, $id and $timestamp,
# and both printheader() and printfooter() print to the global OUT handle,
# so these must stay file-level variables.
my $infile;
my $outfile;
my $title;
my $id = 9000;
my $count = 0;

# One UTC timestamp (YYYY-MM-DDThh:mm:ssZ) is stamped on every page of a run.
my $timestamp = `date -u +"%FT%TZ"`;
chomp $timestamp;

# Offset the base id by the number the site-local helper prints.
# NOTE(review): presumably this is the count of ids already in use -- confirm
# against getwikiid.pl. If the helper is missing or prints nothing usable we
# fall back to an offset of 0 instead of doing arithmetic on garbage.
my $reported = `/home/rday/bin/getwikiid.pl`;
if (defined $reported && $reported =~ /\A\s*(-?\d+)/) {
    $count = $1;
}
else {
    warn "getwikiid.pl gave no usable id offset; using 0\n";
    $count = 0;
}
$id = $id + $count;

opendir(my $dir, $indir) or die "couldn't open $indir for reading: $!";
while (defined($infile = readdir($dir))) {
    # Only *.txt exports are converted; dotfiles and earlier *.xml output
    # fail this test too. The pattern matches the one used to derive $title.
    next unless $infile =~ /\.txt\z/;
    print "found $infile...\n";

    # Output is written to the current working directory, not to $indir.
    ($outfile = $infile) =~ s/\.txt\z/.xml/;
    print "switch to $outfile\n";

    ($title = $infile) =~ s/\.txt\z//;

    # Three-argument open: the file name is never parsed for a mode.
    open(my $in, '<', "$indir/$infile")
        or die "couldn't open $infile for read: $!";
    open(OUT, '>', $outfile)
        or die "couldn't open $outfile for write: $!";
    printheader();

    # Slurp the whole export; several patterns below span multiple lines.
    my $text = do { local $/; <$in> };
    $text = '' unless defined $text;
    close($in);

    # Count the lines of this file only (a final unterminated line counts).
    my $lines = ($text =~ tr/\n//);
    $lines++ if length($text) && $text !~ /\n\z/;

    for ($text) {
        s/<\/?eTitle>/==/g;            # Sitescape title -> wiki heading
        s/<\/?eAbstract[^>]*?>//gi;
        s/<\/?div[^>]*?>//gi;
        s/<hr[^>]*?>/<hr\/>/gi;
        s/<\/?meta[^>]*?>//gi;
        s/<\/?st1[^>]*?>//gi;
        s/<\/?font[^>]*?>//gi;
        s/<\/?span[^>]*?>//gi;
        s/<\/?html[^>]*?>//gi;
        s/<\/?body[^>]*?>//gi;
        s/<\/?img[^>]*?>//gims;
        s/<\/?o:p[^>]*?>//gi;
        s/<br>/<br\/>/gi;
        # \b keeps <pre>, <param> and the like from being rewritten as <p>.
        s/<p\b[^>]*?>/<p>/gi;
        s/<\/p>/<\/p>/gi;              # normalises the tag's case
        s/<head>.*?<\/head>//igms;
        s/<style>.*?<\/style>//igms;
        s/<!--.*?-->//gms;
        s/<!doctype[^>]*>//gi;
        s/<\?xml[^>]*>//gims;
        # \b keeps <link ...> from being rewritten as a list item.
        s/<li\b[^>]*?>/<li\/>/gims;
        s/<\/li>//gi;
        s/<ul[^>]*?>/<ul>/gims;
        s/<\/ul>/<\/ul>/gi;            # normalises the tag's case
        # Drop non-breaking-space entities. The pattern used to be a bare
        # space (presumably the entity was lost when the script was rendered
        # as HTML), which deleted every space in the page.
        s/&nbsp;//g;
    }

    # Entity-encode what is left so it can sit inside the <text> element.
    print OUT encode_entities($text);

    print "scanned $lines lines\n";
    printfooter();

    # Buffered write errors only surface at close, so check it.
    close(OUT) or die "couldn't finish writing $outfile: $!";
    print "converted $outfile...\n";

    # Every page gets the next id.
    $id++;
}
closedir($dir);
# printheader - write the opening part of a one-page MediaWiki XML export.
#
# Takes no arguments. Prints to the global OUT filehandle, which the caller
# must already have opened for writing, and reads the file-scoped variables
# $title (page title), $id (used for both the page id and the revision id)
# and $timestamp (revision timestamp). The output stops just inside the
# <text> element, after a [[Category:Sites]] line; printfooter() closes it.
# Returns the result of the print.
sub printheader {
    # $title comes from a file name, so escape the XML-special characters;
    # otherwise a name such as "R&D.txt" yields a malformed <title> element.
    # '&' is handled first so the entities added below are not re-escaped.
    my $safe_title = $title;
    $safe_title =~ s/&/&amp;/g;
    $safe_title =~ s/</&lt;/g;
    $safe_title =~ s/>/&gt;/g;

    print OUT <<EOF;
<mediawiki xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.3/
http://www.mediawiki.org/xml/export-0.3.xsd" version="0.3" xml:lang="en">
<siteinfo>
<sitename>PTAGISWiki</sitename>
<base>http://php.ptagis.org/wiki/index.php/Main_Page</base>
<generator>MediaWiki 1.5.6</generator>
<case>first-letter</case>
<namespaces>
<namespace key="-2">Media</namespace>
<namespace key="-1">Special</namespace>
<namespace key="0"></namespace>
<namespace key="1">Talk</namespace>
<namespace key="2">User</namespace>
<namespace key="3">User talk</namespace>
<namespace key="4">Wiki</namespace>
<namespace key="5">Wiki talk</namespace>
<namespace key="6">Image</namespace>
<namespace key="7">Image talk</namespace>
<namespace key="8">MediaWiki</namespace>
<namespace key="9">MediaWiki talk</namespace>
<namespace key="10">Template</namespace>
<namespace key="11">Template talk</namespace>
<namespace key="12">Help</namespace>
<namespace key="13">Help talk</namespace>
<namespace key="14">Category</namespace>
<namespace key="15">Category talk</namespace>
</namespaces>
</siteinfo>
<page>
<title>$safe_title</title>
<id>$id</id>
<revision>
<id>$id</id>
<timestamp>$timestamp</timestamp>
<contributor><username>Root</username><id>1</id></contributor>
<text space="preserve">
[[Category:Sites]]
EOF
}
# printfooter - write the closing tags of the export document.
#
# Takes no arguments. Prints to the global OUT filehandle, closing the
# <text>, <revision>, <page> and <mediawiki> elements in that order, one
# tag per line. Returns the result of the print.
sub printfooter {
    my @closing_tags = ('</text>', '</revision>', '</page>', '</mediawiki>');

    # The trailing empty element puts a newline after the last tag as well.
    print OUT join("\n", @closing_tags, '');
}