This page is likely outdated (last edited on 02 Jul 2008). Visit the new documentation for updated content.
Irc2Wiki
A mutiliated version of Jamie Zawinksi’s irc2html.pl that creates basic MediaWiki tables instead of HTML. Used by the Accessibility Team to post IRC meeting logs. Works great with irssi log format; can’t speak for anything else.
#!/usr/bin/perl -w
# irc2wiki.pl --- Hacked up irc2html.pl that spits plain MediaWiki tables
# Sandy Armstrong <sanfordarmstrong@gmail.com>
# irc2html.pl --- converts raw chat logs to a readable HTML table-ized form.
# Copyright © 1998-2006 Jamie Zawinski <jwz@jwz.org>
#
# Permission to use, copy, modify, distribute, and sell this software and its
# documentation for any purpose is hereby granted without fee, provided that
# the above copyright notice appear in all copies and that both that
# copyright notice and this permission notice appear in supporting
# documentation. No representations are made about the suitability of this
# software for any purpose. It is provided "as is" without express or
# implied warranty.
#
# Created: 11-Jun-98.
#
# usage: irc2html.pl [--verbose] [--strip] [--no-color] infile outfile
# Or, just install it as a CGI on your web server.
#
# The format it expects from the logs is the format written by, e.g., tkirc:
# each line has a \t on it, and the part before the tab is the person talking,
# or the action happening ("[ join ]" and so on) and the part after the tab
# is the actual text.
#
# A number of other formats are accepted as well, e.g.,
#
# [17:01:17] <user> blah blah
# and
# 13:25 [xxx(~xxx@00.11.222.333)] blah blah
# 13:25 [msg(xxx)] blah blah
#
# If the line doesn't match any of the known patterns, then we assume that
# the stuff before the first space is the action and the rest is the text.
#
# The lines alternate white and gray, changing each time the person talking
# changes (so multiple lines by the same person are colored the same.)
# Or, if --no-color is specified, the talkers are separated by solid
# horizontal lines.
#
# If a line ends in something that looks like a timestamp, that goes in
# column 3.
#
# The --strip option means to strip out non-talking lines, like "[ join ]".
#
# URLs are made clickable.
#
# Color codes, bold, etc. are also supported, thanks to
# Andy Brezinsky <andy@mbrez.com> and Juerd <juerd@juerd.nl>.
#
#
# This script also works as a CGI: just make a symlink (e.g., irc2html.cgi)
# and it will present you with a text area where you can paste your char log.
# When you submit, it will return the HTML document. This might save some
# time messing around with temporary files.
#
# 5-May-2006: jwz: updated to use CSS instead of <FONT> tags.
require 5;
use diagnostics;
use strict;
my $progname = $0; $progname =~ s@^.*/([^/]+)$@$1@;
my $version = q{ $Revision: 1.23 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/;
my $verbose = 0;
my $strip_seconds = 1; # whether to keep timestamps in hh:mm only.
my $strip_timestamps = 0; # whether to omit any third column entirely.
my $strip_noise = 0; # whether to strip "join", "leave", etc.
my $colorize_p = 1; # whether to do background colors.
my $separators_p = 0; # whether to do horizontal lines.
my $chunk_size = 200; # start a new table after this many lines.
my $url_re = q{\b(s?https?|ftp|file|gopher|s?news|telnet|mailbox):} .
q{(//[-A-Z0-9_.]+:\d*)?} .
q{[-A-Z0-9_=?\#\$\@~\`%&*+|\/.,;\240]+};
my $fg_color = "#000";
my $bg_color1 = "#FFF";
my $bg_color2 = "#EEE";
$ENV{USER} = ($ENV{REMOTE_ADDR} || '?') unless defined ($ENV{USER});
my $doc_head =
(
""
);
my $stylesheet =
("<!-- This style sheet can be omitted if you don't care\n" .
" about the colored text that was used below. -->\n" .
"<STYLE TYPE=\"text/css\">\n" .
" <!--\n" .
" .fc0 { color:white } .bc0 { background:white }\n" .
" .fc1 { color:black } .bc1 { background:black }\n" .
" .fc2 { color:navy } .bc2 { background:navy }\n" .
" .fc3 { color:green } .bc3 { background:green }\n" .
" .fc4 { color:red } .bc4 { background:red }\n" .
" .fc5 { color:darkred } .bc5 { background:darkred }\n" .
" .fc6 { color:purple } .bc6 { background:purple }\n" .
" .fc7 { color:orange } .bc7 { background:orange }\n" .
" .fc8 { color:yellow } .bc8 { background:yellow }\n" .
" .fc9 { color:lime } .bc9 { background:lime }\n" .
" .fc10 { color:aqua } .bc10 { background:aqua }\n" .
" .fc11 { color:C0C0FF } .bc11 { background:C0C0FF }\n" .
" .fc12 { color:blue } .bc12 { background:blue }\n" .
" .fc13 { color:magenta } .bc13 { background:magenta }\n" .
" .fc14 { color:gray } .bc14 { background:gray }\n" .
" .fc15 { color:silver } .bc15 { background:silver }\n" .
# " .fixed {font-family: fixedsys, courier new, courier, fixed;\n" .
# " font-size: xx-small; }\n" .
" -->\n" .
"</STYLE>\n" .
"\n");
my $margin_style = " STYLE=\"border-top: 1px solid;\""; # used for separators
my $input_lines = 0;
my $output_lines = 0;
sub convert_line_body($) {
my ($line) = @_;
# don't allow any single word to be longer than N chars -- insert spaces.
# (actually, insert non-breaking spaces (240) which we convert later.)
1 while ($line =~ s/^(.*?)([^ \t\n\240]{40})([^ \t\n\240].*)$/$1$2\240$3/s);
$line =~ s/&/&/g;
$line =~ s/</</g;
$line =~ s/>/>/g;
# $line =~ s/\"/"/g;
# make urls clickable.
#$line =~ s@($url_re)@<A HREF="$1">$1</A>@gi;
# if the url appeared to end in punctuation, back up over it.
#$line =~ s@([-.,!;?\#]+)(\">)@$2@gs;
#$line =~ s@([-.,!;?\#]+)(</A>)@$2$1@gs;
# if we inserted any non-breaking spaces inside URLs, just nuke them.
#1 while ($line =~ s@(<A HREF=\"[^\"\240]*)\240([^\"]*\">)@$1$2@gi);
# convert remaining non-breaking spaces to real spaces.
$line =~ s/\240/ /g;
#if ($verbose) {
# $_ = $line;
# while (s@<A HREF=\"(.*?)\">@@i) {
# print STDERR "$progname: line $input_lines: URL: $1\n";
# }
#}
# if there are any ctl characters in it, do color-decoding
$line = convert_color ($line)
if ($line =~ m/[\000-\010\013\014\016-\037]/);
return $line;
}
my $prev_head = "";
my $prev_stamp = "";
my $bg_color = $bg_color1;
my $line_tick = 0;
my $any_styles = 0; # whether any red hot style sheet action is needed
sub convert_line($) {
($_) = @_;
# alternate log format...
s@^[[(](\d?\d:\d\d(:\d\d)?)[])] (.*)$@$3 {$1}@gs; # leading timestamp
s@^(\d?\d:\d\d(:\d\d)? ?([AP]M)?) (.*)$@$4 {$1}@gsi; # leading timestamp
# leading time/date stamp
s@^\[\d\d?/\d\d/\d\d \@ (\d?\d:\d\d(:\d\d)?)\]\s+(.*)$@$3 {$1}@gs;
# IRSSI leading full date log format
s@^\d\d?-...-\d\d\d\d (\d\d?:\d\d(:\d\d)?)\s+(.*)$@$3 {$1}@gs;
# GAIM inline timestamps (alone on a line with leading whitespace)
s@^ +((\d\d?:)?\d\d?:\d\d)$@*** $1@gs;
# goofy name+host format...
s@^\[msg\(([^()\[\] \t]+)\)\] @>> $1\t@gsi;
s@^\[([^()\[\] \t]+)\(([^()\[\] \t]+)\)\] @<$1>\t@gsi;
if (m@^\s*$@) {
$prev_head = "";
$bg_color = $bg_color1;
my $color_style = '';
if ($colorize_p) {
$color_style = " STYLE=\"background:$bg_color\"";
}
return "<TR><TD COLSPAN=2$color_style><BR></TD></TR>\n";
}
my ($head, $body) = m@^([^\t]+)[\t]+(.*)$@s;
my $stamp = undef;
if (!defined($head)) {
# it's best if the head and body are separated by a tab; but if we can't
# get that, try for it separated by spaces.
# if (m@^(<[^> \t\n]+>) (.*)$@s) {
if (m@^(<[\@\+ ]?[^> \t\n]+>) (.*)$@s) {
$head = $1;
$body = $2;
} elsif (m@^(\* [^> \t\n]+) (.*)$@s) {
$head = $1;
$body = $2;
} elsif (m@^(\[ *[^\]]+\]) (.*)$@s) {
$head = $1;
$body = $2;
} elsif (m@^([^\[\]<> \t\n:]+:) (.*)$@s) {
$head = $1;
$body = $2;
} elsif (m@^(>) (.*)$@s) {
$head = $1;
$body = $2;
} elsif (m@^(\*+ *[^\[\]<> \t\n:]+) (.*)$@s) {
$head = $1;
$body = $2;
} else {
$head = "";
$body = $_;
print STDERR "$progname: unparsable line $input_lines: $_\n" if $verbose;
}
}
# If there's a timestamp at the end of the line, put it in column 3.
# Otherwise, there is no column 3.
if ($body =~
m@^(.*?)\s*[({]?\s*(\d\d?:\d\d(:\d\d)?( ?[AP]M)?)\s*[)}]?\s*$@i) {
$body = $1;
$stamp = $2;
if ($strip_timestamps) {
$stamp = '';
} elsif ($strip_seconds) {
$stamp =~ s@^(\d?\d:\d\d):\d\d$@$1@;
}
if ($stamp eq $prev_stamp) {
$stamp = undef;
} else {
$prev_stamp = $stamp;
}
}
if ($strip_noise && $head =~ m@^(\[|\*\*\*)@) {
print STDERR "$progname: deleting noise line $input_lines: $head\n"
if $verbose;
return "";
}
$head = convert_line_body ($head);
$body = convert_line_body ($body);
my $continuation = 1;
my $color_changed_p = 0;
if ($head ne $prev_head) {
$prev_head = $head;
$continuation = 0;
$color_changed_p = 1;
if ($bg_color eq $bg_color1) {
$bg_color = $bg_color2;
} else {
$bg_color = $bg_color1;
}
}
$head .= " ";
if ($head =~ m@^(\[|\*\*\*)@ ||
$head =~ m@\(\s*\d\d\d+\s*\)@) {
$head = "''$head''";
$body = "''$body''";
} elsif ($continuation) {
$head = ("");
} else {
$head = "'''$head'''";
}
# Specify the foreground color in the table cells too, in case this
# html gets pasted somwehere with conflicting document colors.
#
my $color_style = '';
if ($separators_p && $color_changed_p && $output_lines != 0) {
$color_style = $margin_style;
} elsif ($colorize_p) {
$color_style = (" STYLE=\"color:$fg_color" .
($bg_color eq $bg_color1
? ''
: "; background:$bg_color") .
"\"");
}
my $result = "|-\n" .
"| $head\n" .
"| $body\n";
if ($stamp) {
$result .= "| $stamp\n";
}
if ($line_tick++ > $chunk_size) {
$line_tick = 0;
$color_style = '';
if ($bg_color ne $bg_color1) {
if ($separators_p) {
$color_style = $margin_style;
} elsif ($colorize_p) {
$color_style = " STYLE=\"background:$bg_color1\"";
}
}
}
$result =~ s@<TD VALIGN=TOP NOWRAP></TD>@<TD></TD>@g; # simplify...
$result =~ s@ NOWRAP(>[^\s]+?</TD>)@$1@g; # if no spaces, don't need nowrap
$output_lines++;
return $result;
}
sub convert_color($) {
my ($str) = @_;
$str =~ s/\e\[.*?[a-zA-Z]//g;
if ($str =~ /(<BR>)/i) {
$str =~ s//\cO$1/g;
} else {
$str .= "\cO";
}
$str =~ s/\cV(.*?)\cV/\cC0,1$1\cO/g;
$str =~ s/\cV/\cC0,1/;
my @chars = (split(//, $str));
my $ret = '';
my ($CCfc, $CCbc, $CCnc);
my $CCb = 0;
my $CCu = 0;
while (@chars) {
my $char = shift @chars;
if ($char eq "\cB") { $ret .= ($CCb ? '</b>' : '<b>'); $CCb = !$CCb; }
elsif ($char eq "\c_") { $ret .= ($CCu ? '</u>' : '<u>'); $CCu = !$CCu; }
elsif ($char eq "\cO") {
if ($CCb) { $CCb = 0; $ret .= '</b>'; }
if ($CCu) { $CCu = 0; $ret .= '</u>'; }
if ($CCnc) {
$CCbc= 0; $ret .= '</SPAN>' x $CCnc;
$CCfc= 0; $CCnc = 0; }
} elsif ($char eq "\cC") {
my $CCcs = '';
while (($char = shift @chars) =~ /[\d,]/) {
$CCcs .= $char;
}
if ($char =~ /[^\d,]/) {
unshift @chars, $char;
}
if ($CCcs eq '') {
if ($CCbc) { $ret .= '</SPAN>'; $CCnc--; }
if ($CCfc) { $ret .= '</SPAN>'; $CCnc--; }
$CCbc = undef; $CCfc = undef;
} else {
my @k = split /,/, $CCcs;
if (defined($k[0]) && $k[0] ne '') {
$CCfc = $k[0];
$ret .= "<SPAN CLASS=fc$CCfc>";
$any_styles++;
$CCnc++;
}
if (defined($k[1]) && $k[1] ne '') {
$CCbc = $k[1];
$ret .= "<SPAN CLASS=bc$CCbc>";
$any_styles++;
$CCnc++;
}
}
}
else {
$ret .= $char;
}
}
return $ret;
}
my $buffered_line = undef;
sub convert($) {
my($line) = @_;
if ($line =~ m/^[A-Za-z]/ &&
$line !~ m/^[^ \t\r\n:]+: /) {
$buffered_line = '' unless defined ($buffered_line);
$buffered_line .= $line;
return "";
} else {
my $ret = (defined ($buffered_line)
? convert_line ($buffered_line)
: "");
$buffered_line = $line;
return $ret;
}
}
# convert everything in the given string.
sub convert_body($) {
my ($body) = @_;
my $color_style = '';
if ($colorize_p) {
$color_style = " STYLE=\"background:$bg_color\"";
}
my $output = "{|border=\"0\" cellpadding=\"5\" cellspacing=\"0\" align=\"center\"\n";
$input_lines = 0;
foreach (split ("\n", $body)) {
$input_lines++;
$output .= convert ($_);
}
$output .= convert ("");
$output .= "|-\n|}";
my $ss = ($any_styles ? $stylesheet : "");
$_ = $doc_head;
s/%%STYLESHEET%%/$ss/;
$output = $_ . $output;
return $output;
}
# This is the main loop when we seem to have been invoked as a CGI script.
#
sub do_cgi() {
error (400, "malformed URL") if ($ENV{PATH_INFO} || $ENV{QUERY_STRING});
if ($ENV{REQUEST_METHOD} eq 'GET') {
#
# GET means this is the top-level invocation: just display the HTML
# for the form in which the user can paste an IRC log.
#
print STDOUT "Content-Type: text/html\n\n";
print STDOUT
("<HTML>\n" .
" <HEAD>\n" .
" <TITLE>$progname</TITLE>\n" .
" </HEAD>\n" .
" <BODY>\n" .
" <FORM METHOD=POST>\n" .
" <TABLE>\n" .
" <TR>\n" .
" <TD COLSPAN=2 VALIGN=TOP ALIGN=CENTER>\n" .
" <DIV ALIGN=CENTER>\n" .
" <B><SPAN STYLE='font-size:larger'>Chat log to HTML Converter<BR></SPAN>\n" .
" by <A HREF=\"http://www.jwz.org/\">" .
"Jamie Zawinski</A><BR>\n" .
" Version <A HREF=\"http://www.jwz.org/hacks/irc2html.pl\">" .
"$version</A>\n" .
" </SPAN>\n" .
" Paste your IRC/AIM log here:</B><BR>\n" .
" <TEXTAREA NAME=\"body\" ROWS=10 COLS=60 WRAP=OFF></TEXTAREA>\n" .
" </TD>\n" .
" </TR>\n" .
" <TR>\n" .
" <TD VALIGN=TOP ALIGN=LEFT NOWRAP>\n" .
" <INPUT NAME=\"strip_status\" TYPE=CHECKBOX>" .
" Strip Status Lines<BR>\n" .
" <INPUT NAME=\"strip_timestamps\" TYPE=CHECKBOX>" .
" Strip Timestamps<BR>\n" .
" <INPUT NAME=\"colorize\" TYPE=CHECKBOX CHECKED> Colorize\n" .
" </TD>\n" .
" <TD VALIGN=MIDDLE ALIGN=RIGHT NOWRAP>\n" .
" <INPUT TYPE=SUBMIT VALUE=\"Generate HTML\">\n" .
" \n" .
" <INPUT TYPE=RESET VALUE=\"Clear\">\n" .
" </TD>\n" .
" </TR>\n" .
" </TABLE>\n" .
" </FORM>\n" .
" </BODY>\n" .
"</HTML>\n");
} elsif ($ENV{REQUEST_METHOD} eq 'POST') {
#
# POST means that the form has been submitted; read the log text and
# print out the corresponding HTML.
#
my $args = '';
while (<>) { $args .= $_; }
$colorize_p = 0; # for cgi, must default to 0
my $body = '';
foreach (split ('&', $args)) {
my ($key, $val) = m/^(.*?)=(.*)$/s;
if ($key eq 'body') { $body .= $val; }
elsif ($key eq 'strip_status') { $strip_noise = ($val eq 'on'); }
elsif ($key eq 'strip_timestamps') { $strip_timestamps = ($val eq 'on');}
elsif ($key eq 'colorize') { $colorize_p = ($val eq 'on'); }
else { error (400, "unknown option $key"); }
}
$separators_p = !$colorize_p;
# $colorize_p = 1;####
$body =~ s/\+/ /g; # spaces are encoded
$body =~ s/%([\dA-F]{2})/chr(hex($1))/gexi; # de-hexilate
$body =~ s@\r\n@\n@sg; # CRLF -> LF
$body =~ s@\r@\n@sg; # CR -> LF
print STDOUT "Content-Type: text/html\n\n";
print STDOUT convert_body ($body);
} else {
error (400, "$ENV{REQUEST_METHOD} not supported");
}
}
sub error($$) {
my ($http_status, $err) = @_;
if (defined($ENV{REQUEST_URI})) {
print "Status: $http_status\n";
print "Content-Type: text/html\n";
print "\n<TITLE>Error</TITLE>\n";
print "<H1>$http_status</H1><P>\n";
$err =~ s/&/&/g;
$err =~ s/</</g;
$err =~ s/>/>/g;
print "$err\n<P>\n";
exit (0);
} else {
print STDERR "$progname: $err\n";
exit 1;
}
}
sub usage() {
print STDERR "usage: $progname [--verbose] [--strip] [--no-color]\n" .
" [input-chat-log] [output-html-file]\n";
exit 1;
}
sub main() {
my $in = undef;
my $out = undef;
if (defined ($ENV{REQUEST_URI})) {
return do_cgi();
}
while ($_ = $ARGV[0]) {
shift @ARGV;
if ($_ eq "--verbose") { $verbose++; }
elsif ($_ eq "--strip") { $strip_noise++; }
elsif ($_ eq "--no-time") { $strip_timestamps++; }
elsif ($_ eq "--no-color") { $colorize_p = 0; $separators_p = 1; }
elsif (m/^-v+$/) { $verbose += length($_)-1; }
elsif (m/^-./) { usage; }
elsif (!defined($in)) { $in = $_; }
elsif (!defined($out)) { $out = $_; }
else { usage; }
}
$in = "-" unless $in;
$out = "-" unless $out;
local *IN;
local *OUT;
open (IN, "<$in") || die "$progname: unable to read $in: $!\n";
open (OUT, ">$out") || die "$progname: unable to write $out: $!\n";
$in = "stdin" if $in eq "-";
$out = "stdout" if $out eq "-";
if ($verbose) {
print STDERR "$progname: converting $in to $out...\n";
}
my $body = '';
$body .= $_ while (<IN>);
print OUT convert_body ($body);
close OUT;
close IN;
if ($verbose) {
print STDERR "$progname: wrote $out ($output_lines lines).\n";
}
}
main();
exit 0;