#!/usr/bin/perl
use strict;
# |
# | OMHPP
# |
# | The Omino HTML Preprocessor
# |
# | Like many "simple" projects, this one, too, is born of the
# | traditional hubris:
# |
# | I can do something simple but sufficient for a traditionally complex task.
# |
# | The task at hand is to support a small manually-maintained website where the
# | client can make changes to the content. The design goal is to give them a
# | simplified markup language that can be edited in a normal text editor.
# |
# | The approach taken here is also traditional -- templates.
# | The templates are of exactly two types: flat files and roster files.
# | This combined with the ability to ::include:: other files gives a
# | moderately flexible toolset for building simple sites.
# |
# | The omhpp is intended to be used in either of
# | two ways:
# | way one: As a server-side CGI script, processing templates on-demand
# | way two: As a build-time preprocessor, where you might run "make"
# | to construct your site.
# |
# | Expected file suffixes:
# |
# | foofoo.hpp -- a top level linkable page
# | foofoo.hpp_include -- a file for inclusion to a linkable page
# | foofoo.hpp_roster -- a file with roster (table) elements
# |
# | Default file when running as CGI: index.hpp
# |
# | Directives and Markup
# |
# | # a comment line starts with a hash
# |
# | ::include any_filename::
# | includes a file, presumed relative to the current file.
# |
# | [[any_url and some descriptive text]]
# | becomes an link. If the url ends with .hpp or .hpp*, then
# | the generated link includes the preprocessor invocation as well
# |
# | Simple Markup Shortcuts
# | *any text to be bolded* may have spaces before and after
# | _any text to be italicized_ must have spaces before and after
# |
# |
# | Roster Structure
# |
# | A "roster" is a list of elements, each formatted similarly.
# | A roster file is divided into four distinct sections. Each
# | section is separated by a line containing five (or more)
# | equals signs. The last section contains all the elements.
# |
# | We proceed by example:
# |
# | This is the header of the roster.
# | Cast Of Characters
# |
# | ===============
# | # this part is repeated for each element
# | *::name::* |
# | *::description::* |
# | # optional field
# |
# | ===============
# |
# | That was the table.
# | ===============
# | # now the elements of the roster...
# | name:Nigel
# | description: a detective
# | --------
# | name:Samantha
# | description: a woman with a mysterious past
# | --------
# | name:Nigel
# | description: Samantha's brother, who is also named Nigel
# | --------
# | name:Fred
# | description: A bartender
# | # end of file
# | Roster markup: blah blah *this is bold* blah
# | hah hah _this is italic_ blah
# | this that click [[here_or_there.com/that here]] to see what I mean
# | (dont include http:// in the URL)
my $gDebugBlab = 0;
my $TOOL = $0; # who am i?
my $COLON_HOLDER = "---f23_COLON---";
my $BRACE_HOLDER = "---f23_BRACE---";
my $STARR_HOLDER = "---f23_STAR---";
sub errorBox
{
my $e;
$e .= "\n";
$e .= "Error $TOOL\n";
while(@_)
{
my $x = shift;
$e .= $x;
}
$e .= "\n
\n";
return $e;
}
sub fail
{
my $e = errorBox(@_);
print $e;
#die;
}
my %gArgs;
my $gIsCGI = defined($ENV{QUERY_STRING});
my $gScriptName = $ENV{SCRIPT_NAME}; # if cgi-ing
sub fetchArguments()
{
foreach my $arg (@ARGV)
{
dPrint("arg: $arg");
if($arg =~ /^--(.*?)=(.*)$/)
{
$gArgs{$1} = $2;
}
}
my $cgiQueryString = $ENV{QUERY_STRING};
my @cgiArgv = split(/&/,$cgiQueryString);
foreach my $arg (@cgiArgv)
{
if($arg =~ /^(.*?)=(.*)$/)
{
$gArgs{$1} = $2;
$gIsCGI = 1;
}
}
$gDebugBlab = getArg("debug",$gDebugBlab);
}
sub getArg($$)
{
my ($arg,$dflt) = (@_);
my $result = $gArgs{$arg};
if($result eq "")
{
$result = $dflt;
}
return $result;
}
# +---------------------
# | fetch the contents of a file
# | if the file is named "-" then
# | fetch stdin
# |
sub readFile($)
{
my ($fileName) = (@_);
my $fileContents = "";
dPrint("readFile: $fileName");
my $fh;
if($fileName eq "-")
{
$fh = *STDIN;
}
else
{
dPrint ("readFile: opening $fileName");
if(! -f $fileName)
{
return errorBox("cannot open $fileName");
}
open F,$fileName || fail("cannot open $fileName");
$fh = *F;
}
while(my $a = <$fh>)
{
if($a !~ /^\#.*/) # throw out comments
{
$fileContents .= $a;
}
}
dPrint ("readFile: read $fileName to get -- $fileContents");
return $fileContents;
}
# +------
# | For debugging
sub dPrint(@)
{
return unless $gDebugBlab;
print "\n";
}
# +------
# | For debugging
sub printArray($@)
{
return unless $gDebugBlab;
print "\n\n=========================\n";
my $msg = shift;
my $count = scalar(@_);
print "$msg: $count\n";
my $i = 0;
foreach my $e (@_)
{
print "$i. $e\n";
$i++;
}
print "\n";
}
# +------
# | For debugging
sub printHash($\%)
{
return unless $gDebugBlab;
print "\n\n=========================\n";
my $msg = shift;
my $hashRef = shift;
my $count = scalar(%$hashRef);
print "$msg: $count\n";
my $i = 0;
foreach my $e (sort(keys(%$hashRef)))
{
my $v = $$hashRef{$e};
print "$i. $e : $v \n";
$i++;
}
}
my $rosterFieldNamePattern = "[a-zA-Z0-9-_]+";
my $cr = "[\n\r]";
# +-----------------
# | Given a roster section, return a
# | hash of the fields
# | Rosters look like:
# |
# | fieldname: stuff
# | fieldname: stuff
# | stuff stuff stuff
# | fieldname: stuff
# |
sub processRosterSection($)
{
my $rosterSection = shift;
my %sectionValues;
while($rosterSection =~ /^\s*($rosterFieldNamePattern):(.*)$/s)
{
my $field = $1;
my $value;
my $restOfSection = $2;
#
# Is there another field after this one?
if($restOfSection =~ /^(.*?)\n($rosterFieldNamePattern:.*)$/s)
{
$value = $1;
$rosterSection = $2; # the rest of it
}
else
{
$value = $restOfSection; # all of it
$rosterSection = "";
}
$sectionValues{$field} = $value;
}
return \%sectionValues;
}
# +------------------
# | Given the last part of a roster file,
# | break up the roster elements and return
# | an array of roster-field hashes.
# |
# | Roster elements look like:
# |
# | -------
# | field: stuff
# |
# | -------
# | # another section
# | field: stuff
# |
sub processRoster($)
{
my $rosterSection = shift;
my @result;
my @rosterSections = split(/${cr}-{5,}${cr}/,$rosterSection);
printArray("roster sections: ",@rosterSections);
foreach my $rosterSection (@rosterSections)
{
my $rosterSectionFields = processRosterSection($rosterSection);
printHash("roster section",%$rosterSectionFields);
push(@result,$rosterSectionFields);
}
return @result;
}
# +-----------------
# | given a directive and the current stack
# | of replacement mappings, perform it.
# | it may be a file insertion, or a table, &c &c
sub xxxprocessDirective($;@)
{
my $directiveRaw = shift;
my @replacements = @_;
my $result = "";
dPrint("processDirective: $directiveRaw");
my @directiveParts = split(/ +/,$directiveRaw);
my $directive = shift(@directiveParts);
if($directive eq "include")
{
my $includeFile = shift(@directiveParts);
$result = processFile($includeFile,@replacements);
}
elsif($directive eq "href")
{
my $linkTo = shift(@directiveParts);
if($gIsCGI)
{
$result = "$gScriptName?src=$linkTo";
}
else
{
# strip trailing .hpp, if that's the suffix
if($linkTo =~ /^(.*)\.hpp$/)
{
$linkTo = $1;
}
$result = "$linkTo.html";
}
}
else
{
$result = errorBox("
Unknown directive ::${directiveRaw}::");
}
return $result;
}
# +-------------------
# | Given a block of text and
# | optionally one or more hash fields,
# | do all the supported expansions, including
# | file- and table- inclusions.
# |
sub doIncludes($$;@)
{
my $fileName = shift;
my $sourceText = shift;
my @replacements = @_;
my $resultText = "";
while($sourceText =~ /^(.*?)::include ([^\n\r]+?)::(.*)$/s)
{
$resultText .= $1;
my $includeFileName = $2;
$sourceText = $3;
dPrint("doIncludes, includeFileName=\"$includeFileName\"");
$includeFileName = makePathRelative($fileName,$includeFileName);
my $includeFileContents = readFile($includeFileName);
$includeFileContents = processFile($includeFileName,@replacements);
$resultText .= $includeFileContents;
}
# add on the tail...
$resultText .= $sourceText;
dPrint("doIncludes: leaving with $resultText");
return $resultText;
}
sub makePathRelative($$)
{
my ($ownerPath,$includePath) = (@_);
# if ownerpath is, say foo/bar.hpp, and includepath is baz.hpp,
# then result is foo/baz.hpp.
#
# if ownerpath is foo/bar.hpp, and includepath is /baz.hpp,
# result is /baz.hpp.
if($includePath =~ /^\/.*$/)
{
return $includePath;
}
my $prefix = "";
if($ownerPath =~ /^(.*\/).*?$/)
{
$prefix = $1;
}
my $result = $prefix . $includePath;
return $result;
}
# +------------------
# | given a file name and optional stack
# | of replacements, process it.
# | decide whether it is flat or a roster
# |
sub processFile($;@)
{
my $fileName = shift;
my @replacements = @_;
my $result = "";
dPrint("processFile: $fileName");
my $fileContents = readFile($fileName);
# allow \: to be a non-interpreted colon
$fileContents =~ s/\\:/$COLON_HOLDER/gs;
$fileContents =~ s/\\\[/$BRACE_HOLDER/gs;
$fileContents =~ s/\\\*/$STARR_HOLDER/gs;
$fileContents = doMarkupShortcuts($fileName,$fileContents);
if(length($fileContents) == 0)
{
dPrint("processFile: empty or missing $fileName");
$result = errorBox("empty or missing file: \"$fileName\"");
dPrint("processFile: result is $result");
}
elsif($fileName =~ /^.*\.hpp_roster$/)
{
$result = processRosterFile($fileName,$fileContents,@replacements);
}
else
{
$result = processFlatFile($fileName,$fileContents,@replacements);
}
# restore escaped characters, sans escape
$result =~ s/$COLON_HOLDER/:/gs;
$result =~ s/$BRACE_HOLDER/[/gs;
$result =~ s/$STARR_HOLDER/*/gs;
return $result;
}
# +-----------------
# | given a filename and an optional stack of
# | replacement hashes, read it in, perform
# | all substitutions and directives, and
# | return the result
# |
sub processFlatFile($$;@)
{
my $fileName = shift;
my $fileContents = shift;
my @replacements = @_;
my $result = doIncludes($fileName,$fileContents,@replacements);
return $result;
}
sub processRosterFile($$;@)
{
my $fileName = shift;
my $fileContents = shift;
my @replacements = @_;
#
# The file is in four parts
# 1. template for top
# 2. template for repeated middle
# 3. template for end
# 4. roster elements
#
# split on =========='s.
#
my @fileParts = split(/${cr}={5,}${cr}/,$fileContents);
printArray("file parts: ",@fileParts);
my @rosterSections = processRoster($fileParts[3]);
my $result = "";
$result .= $fileParts[0]; # TOP of output
# repeat for each roster-row
foreach my $rosterSectionFields (@rosterSections)
{
my $html = $fileParts[1];
# html is now in tempate form...
foreach my $field (sort(keys(%$rosterSectionFields)))
{
my $value = $$rosterSectionFields{$field};
my @valueSplit = split(/(${cr}){2,}/,$value);
$value = "";
for(my $ii = 0; $ii <= $#valueSplit; $ii++)
{
my $v = $valueSplit[$ii];
next if $v =~ /^\s*$/;
my $pp = $ii == $#valueSplit ? "" : ""; #cr-cr = small gap?
$pp = ""; # DISABLE the double-line p-breaking.
$value .= "$v$pp\n";
}
# trim leading & trailing whitespace on value, for subs
if($value =~ /^\s*(.*?)\s*$/)
{
$value = $1;
}
$html =~ s/::${field}::/$value/gs;
# remove conditionals for this field:
$html =~ s/\?\?${field}\?\?//gs;
}
#
# Clean up any stray unfound substitutions
#
$html =~ s/::[a-z-]+:://gs;
#
# Do the standard markup shortcuts, like
# *bold* and [[URL linktext]]
#
$html = doMarkupShortcuts($fileName,$html);
#
# Remove unsatisfied conditionals
#
$html =~ s/\?\?[a-z-]+\?\?.*?[\n\r]//gs;
# and keep it.
$result .= $html;
}
$result .= $fileParts[2]; # BOTTOM of output
return $result;
}
# +----------------------------
# | Perform handy standard replacements
# | that ARE useful to html tags
sub doStandardReplacements($$)
{
my ($fileName,$input) = (@_);
my $src = $input;
# ::thisdir:: becomes directory we're in, for relative
my $thisDir = ".";
if($fileName =~ /^(.*)\/.*?$/gs)
{
$thisDir = $1;
}
# ::up filename:: becomes filename or ../filename or ../../filename ...
my $result = "";
while($src =~ /^(.*?)::([^\n\r]*?)::(.*)$/gs)
{
my $textBefore = $1;
my $aDirective = $2;
my $textAfter = $3;
dPrint("doStandardReplacements ::"."$aDirective"."::");
# for directives with argument...
my $directiveWord = "";
my $directiveArg = "";
if($aDirective =~ /^(.*?) (.*)$/)
{
$directiveWord = $1;
$directiveArg = $2;
}
if($directiveWord eq "up")
{
#
# find a file here or up from here,
# expressed relative to omhpp.cgi
#
my $aFile = $directiveArg;
my $upMax = 6;
while((!-f "$thisDir/$aFile") && ($upMax-- > 0))
{
$aFile = "../$aFile";
dPrint("up $directiveArg -- checking $thisDir/$aFile");
}
$aFile = "missing/$directiveArg" if !-f "$thisDir/$aFile";
$result .= "$textBefore$thisDir/$aFile";
$src = $textAfter;
}
elsif ($directiveWord eq "upr")
{
#
# find a file here or up from here,
# expressed relative to the hpp file
# (same algorithm as "up", but
# replace without prefix to hpp-containing dir
#
my $aFile = $directiveArg;
my $upMax = 6;
while((!-f "$thisDir/$aFile") && ($upMax-- > 0))
{
$aFile = "../$aFile";
}
$aFile = "missingr/$directiveArg" if !-f "$thisDir/$aFile";
$result .= "$textBefore$aFile";
$src = $textAfter;
}
elsif ($aDirective eq "thisdir")
{
$result .= "$textBefore$thisDir";
$src = $textAfter;
} else
{
$result .= $textBefore . "::" . $aDirective . "::";
$src = $textAfter;
}
dPrint("doStandardReplacements result == $result");
}
$result .= $src;
return $result;
}
# +----------------------------
# | Perform any handy quote-fixing, *bolding*, and so on.
# | these fixups should NOT be done on html tags
sub doMarkupShortcuts($$)
{
my ($fileName,$input) = (@_);
my $src = $input;
$src = doStandardReplacements($fileName,$src);
my $srcHtmlized = "";
while($src =~ /^(.*?)\[\[([^ ]+?)\s(.+?)\]\](.*)$/s)
{
dPrint("htmlizing src=$src");
my $textBefore = $1;
my $url = $2;
my $urlText = $3;
$src = $4;
# IF its an hpp reference, handle specially
# to retain links
# TODO: flag to change it into html-ref, if using
# as a build-time preprocessor
# leading backslash means, treat hpp literally, dont cgi it
my $okToEscape = 1;
if($url =~ /^\\(.*)$/)
{
$url = $1;
$okToEscape = 0;
}
$url = makePathRelative($fileName,$url);
if($okToEscape && $url =~ /(.*)\.hpp/)
{
$url = "$TOOL?src=$url";
}
$srcHtmlized .= $textBefore . "$urlText";
}
$srcHtmlized .= $src;
$src = $srcHtmlized;
$srcHtmlized = ""; # done.
###$src =~ s/\[\[([^ ]+?)\s(.+?)\]\]/$2<\/a>/gs;
# start by protecting things inside of ,
# so they don't get formatted
my @chunks;
while($src =~ /^(.*?)(<.*?>)(.*)$/s)
{
my $openText = $1;
my $markupElement = $2;
$src = $3;
dPrint "openText:$openText";
dPrint "markupElement:$markupElement";
push(@chunks,$openText);
push(@chunks,$markupElement);
}
dPrint "src:$src";
push(@chunks,$src);
my $result = "";
# now do each chunk, leaving markup elements intact
foreach my $value (@chunks)
{
dPrint "chunks: value = $value";
if($value =~ /^<.*>$/)
{
# markup chunk? just append it...
$result .= $value;
next;
}
dPrint "a: $value";
#
# Bold things in two *asterisks* on same line
# italicized things in _two underscores_ on the same line
#
$value =~ s/\*([^\n]+?)\*/$1<\/b>/gs;
$value =~ s/ \_([^\n]+?)\_ / $1<\/i> /gs;
dPrint "b: $value";
#
# General quoting (what about attributes, oops)
#
# 2007.03.06 dvb I decided: I dont like quoting
# it interferes too much with stylesheets, javascript, &c.
#$value =~ s/(\s)\"/$1“/gs;
#$value =~ s/\"/”/gs;
#$value =~ s/([a-zA-Z])\'s/$1’s/gs;
# special chars, too
#$value =~ s/\xd5/’/gs;
$result .= $value;
}
dPrint("doMarkupShortcuts $input returns $result");
return $result;
}
my $passCount = 0;
my $failCount = 0;
sub assertEq($$$)
{
my($msg,$v1,$v2) = (@_);
if(!($v1 eq $v2))
{
print("ERROR mismatch $msg: \nwant $v1\ngot $v2\n");
$failCount++;
}
else
{
$passCount++;
}
}
sub assertRelativePath($$$)
{
my ($path,$includePath,$expectedResultPath) = (@_);
my $result = makePathRelative($path,$includePath);
assertEq("relative path expected $expectedResultPath got $result",$expectedResultPath,$result);
}
# miscellaneous runtime tests, invoke with "TEST" as the argument
sub tests()
{
$gDebugBlab = 1;
my $value = "this *is* a test";
my $result = doMarkupShortcuts("",$value);
assertEq("bold asterisks","this is a test",$result);
$result = doMarkupShortcuts("","what _about_ it?");
assertEq("underlines ","what about it?",$result);
$result = doMarkupShortcuts(""," my link: [[http://omino.com omino]].
");
assertEq("href ","",$result);
$result = doMarkupShortcuts(""," my link: [[http://omino.com omino]].
");
assertEq("href & attributes ","",$result);
$result = doMarkupShortcuts("","[[foo.hpp go]]");
assertEq("hpp href","go",$result);
$result = doMarkupShortcuts("here/doc.hpp","[[deeper/foo.hpp go]]");
assertEq("hpp href","go",$result);
# check use of leading \ on url to prevent cgi-ifying
$result = doMarkupShortcuts("here/doc.hpp","[[\\deeper/foo.hpp go]]");
assertEq("escaped hpp href","go",$result);
# check ::up foo::
$result = doMarkupShortcuts("here/doc.hpp","blah blah ");
assertEq("::up foo:: failed","blah blah ",$result);
# check ::upr foo::
$result = doMarkupShortcuts("here/doc.hpp","blah blah ");
assertEq("::upr foo:: failed","blah blah ",$result);
# check ::thisdir::
$result = doMarkupShortcuts("dir1/dir2/foo.hpp","x::thisdir::y");
assertEq("::thisdir:: failed","xdir1/dir2y",$result);
# check ::thisdir:: and safety
$result = doMarkupShortcuts("dir1/dir2/foo.hpp","::include x:: x::thisdir::y");
assertEq("::thisdir:: destroyed failed","::include x:: xdir1/dir2y",$result);
# works for multiple lines?
$value = <
what about *this* ?
out
EOP
;
my $expected = <
what about this ?
out
EOP
;
$result = doMarkupShortcuts("",$value);
assertEq("line breaks",$expected,$result);
# relative path tests
assertRelativePath("here.hpp","subdir/there.hpp","subdir/there.hpp");
assertRelativePath("subdir/here.hpp","subdir2/there.hpp","subdir/subdir2/there.hpp");
assertRelativePath("subdir/here.hpp","/subdir2/there.hpp","/subdir2/there.hpp");
# REPORT the results.
print "$passCount tests pass\n";
print "$failCount tests fail\n";
}
sub main(@)
{
$TOOL = $0; # who am i?
$TOOL = $1 if($TOOL =~ /^.*\/(.*)?$/); # just the name, url is relative to here.
if( $_[0] eq "TEST")
{
tests();
exit(0);
}
print "content-type: text/html\n\n" if $gIsCGI;
fetchArguments();
my $sourceFileName = getArg("src","index.hpp");
if($sourceFileName =~ /^.*\/$/) # ends with slash?
{
$sourceFileName .= "index.hpp";
}
my $result = processFile($sourceFileName);
print $result;
print "\n";
}
main(@ARGV);
#eof