Cast Of Characters

#!/usr/bin/perl
use strict;

# |
# | OMHPP
# |
# | The Omino HTML Preprocessor
# | 
# | Like many "simple" projects, this one, too, is born of the
# | traditional hubris:
# | 
# |    I can do something simple but sufficient for a traditionally complex task.
# | 
# | The task at hand is to support a small manually-maintained website where the
# | client can make changes to the content. The design goal is to give them a
# | simplified markup language that can be edited in a normal text editor.
# | 
# | The approach taken here is also traditional -- templates.
# | The templates are of exactly two types: flat files and roster files.
# | This combined with the ability to ::include:: other files gives a
# | moderately flexible toolset for building simple sites.
# |
# | The omhpp is intended to be used in either of
# | two ways:
# |       way one: As a server-side CGI script, processing templates on-demand
# |       way two: As a build-time preprocessor, where you might run "make"
# |                to construct your site.
# |
# | Expected file suffixes:
# |
# |  foofoo.hpp         -- a top level linkable page
# |  foofoo.hpp_include -- a file for inclusion to a linkable page
# |  foofoo.hpp_roster  -- a file with roster (table) elements
# |
# | Default file when running as CGI: index.hpp
# |

# | Directives and Markup
# |
# | # a comment line starts with a hash
# |
# | ::include any_filename:: 
# |     includes a file, presumed relative to the current file.
# |
# | [[any_url and some descriptive text]]
# |     becomes an <a href=""> link. If the url ends with .hpp or .hpp*, then
# |     the generated link includes the preprocessor invocation as well
# |
# | Simple Markup Shortcuts
# |     *any text to be bolded*       may have spaces before and after
# |     _any text to be italicized_   must have spaces before and after
# |
# |
# | Roster Structure
# |
# | A "roster" is a list of elements, each formatted similarly.
# | A roster file is divided into four distinct sections. Each
# | section is separated by a line containing five (or more)
# | equals signs. The last section contains all the elements.
# |
# | We proceed by example:
# |
# |    This is the header of the roster.
# |    <h1>Cast Of Characters</h1>
# |    <table>
# |    ===============
# |    # this part is repeated for each element
# |    <tr><td> *::name::* </td>
# |        <td> *::description::* </td>
# |    # optional field
# |    </tr>
# |    ===============
# |    </table>
# |    That was the table. <hr />
# |    ===============
# |    # now the elements of the roster...
# |    name:Nigel
# |    description: a detective
# |    --------
# |    name:Samantha
# |    description: a woman with a mysterious past
# |    --------
# |    name:Nigel
# |    description: Samantha's brother, who is also named Nigel
# |    --------
# |    name:Fred
# |    description: A bartender
# |    # end of file

# | Roster markup: blah blah *this is bold* blah
# |                hah hah _this is italic_ blah
# |    this that click [[here_or_there.com/that here]] to see what I mean
# |    (dont include http:// in the URL)


my $gDebugBlab = 0;
my $TOOL = $0; # who am i?

my $COLON_HOLDER = "---f23_COLON---";
my $BRACE_HOLDER = "---f23_BRACE---";
my $STARR_HOLDER = "---f23_STAR---";

sub errorBox
{
    my $e;
    $e .= "<pre style=\"margin:10 ; padding:10 ; border:2pt dashed red ; background-color:#ff8080\">\n";
    $e .= "<b>Error</b> $TOOL\n";
    while(@_)
    {
        my $x = shift;
        $e .= $x;
    }
    $e .= "\n</pre>\n";
    return $e;
}

sub fail
{
    my $e = errorBox(@_);
    print $e;
    #die;
}

my %gArgs;
my $gIsCGI = defined($ENV{QUERY_STRING});
my $gScriptName = $ENV{SCRIPT_NAME}; # if cgi-ing

sub fetchArguments()
{
    foreach my $arg (@ARGV)
    {
dPrint("arg: $arg");
        if($arg =~ /^--(.*?)=(.*)$/)
        {
            $gArgs{$1} = $2;
        }
    }

    my $cgiQueryString = $ENV{QUERY_STRING};
    my @cgiArgv = split(/&/,$cgiQueryString);
    foreach my $arg (@cgiArgv)
    {
        if($arg =~ /^(.*?)=(.*)$/)
        {
            $gArgs{$1} = $2;
            $gIsCGI = 1;
        }
    }

    $gDebugBlab = getArg("debug",$gDebugBlab);
}

sub getArg($$)
{
    my ($arg,$dflt) = (@_);
    my $result = $gArgs{$arg};
    if($result eq "")
    {
        $result = $dflt;
    }
    return $result;
}

# +---------------------
# | fetch the contents of a file
# | if the file is named "-" then
# | fetch stdin
# |
sub readFile($)
{
    my ($fileName) = (@_);
    my $fileContents = "";

dPrint("readFile: $fileName");

    my $fh;
    if($fileName eq "-")
    {
        $fh = *STDIN;
    }
    else
    {
dPrint ("readFile: opening $fileName");
        if(! -f $fileName)
        {
            return errorBox("cannot open $fileName");
        }
        open F,$fileName || fail("cannot open $fileName");
        $fh = *F;
    }
    while(my $a = <$fh>)
    {
        if($a !~ /^\#.*/)    # throw out comments
        {
            $fileContents .= $a;
        }
    }
dPrint ("readFile: read $fileName to get -- $fileContents");
    return $fileContents;
}

# +------
# | For debugging
sub dPrint(@)
{
    return unless $gDebugBlab;

    print "<!-- [dPrint] ";
    print @_," ";
    print "[/dPrint] -->\n";
}


# +------
# | For debugging
sub printArray($@)
{
    return unless $gDebugBlab;

    print "\n\n=========================\n";
    my $msg = shift;
    my $count = scalar(@_);
    print "$msg: $count\n";
    my $i = 0;
    foreach my $e (@_)
    {
        print "$i. $e\n";
        $i++;
    }
    print "\n";
}

# +------
# | For debugging
sub printHash($\%)
{
    return unless $gDebugBlab;

    print "\n\n=========================\n";
    my $msg = shift;
    my $hashRef = shift;
    my $count = scalar(%$hashRef);
    print "$msg: $count\n";
    my $i = 0;
    foreach my $e (sort(keys(%$hashRef)))
    {
        my $v = $$hashRef{$e};
        print "$i. $e : $v \n";
        $i++;
    }
}

my $rosterFieldNamePattern = "[a-zA-Z0-9-_]+";
my $cr = "[\n\r]";

# +-----------------
# | Given a roster section, return a
# | hash of the fields
# | Rosters look like:
# |
# |  fieldname: stuff
# |  fieldname: stuff
# |    stuff stuff stuff
# |  fieldname: stuff
# |
sub processRosterSection($)
{
    my $rosterSection = shift;
    my %sectionValues;


    while($rosterSection =~ /^\s*($rosterFieldNamePattern):(.*)$/s)
    {
        my $field = $1;
        my $value;
        my $restOfSection = $2;

        #
        # Is there another field after this one?
        if($restOfSection =~ /^(.*?)\n($rosterFieldNamePattern:.*)$/s)
        {
            $value = $1;
            $rosterSection = $2; # the rest of it
        }
        else
        {
            $value = $restOfSection; # all of it
            $rosterSection = "";
        }
    $sectionValues{$field} = $value;
    }

    return \%sectionValues;
}

# +------------------
# | Given the last part of a roster file,
# | break up the roster elements and return
# | an array of roster-field hashes.
# |
# | Roster elements look like:
# | 
# | -------
# | field: stuff
# |
# | -------
# | # another section
# | field: stuff
# |
sub processRoster($)
{
    my $rosterSection = shift;

    my @result;

    my @rosterSections = split(/${cr}-{5,}${cr}/,$rosterSection);
    printArray("roster sections: ",@rosterSections);
    foreach my $rosterSection (@rosterSections)
    {
        my $rosterSectionFields = processRosterSection($rosterSection);
        printHash("roster section",%$rosterSectionFields);
        push(@result,$rosterSectionFields);

    }
    return @result;
}
# +-----------------
# | given a directive and the current stack
# | of replacement mappings, perform it.
# | it may be a file insertion, or a table, &c &c
sub xxxprocessDirective($;@)
{
    my $directiveRaw = shift;
    my @replacements = @_;
    my $result = "";

dPrint("processDirective: $directiveRaw");

    my @directiveParts = split(/ +/,$directiveRaw);
    my $directive = shift(@directiveParts);

    if($directive eq "include")
    {
        my $includeFile = shift(@directiveParts);
        $result = processFile($includeFile,@replacements);
    }
    elsif($directive eq "href")
    {
        my $linkTo = shift(@directiveParts);
        if($gIsCGI)
        {
            $result = "$gScriptName?src=$linkTo";
        }
        else
        {
            # strip trailing .hpp, if that's the suffix
            if($linkTo =~ /^(.*)\.hpp$/)
            {
                $linkTo = $1;
            }
            $result = "$linkTo.html";
        }
    }
    else
    {
        $result = errorBox("<br/><br/>Unknown directive ::${directiveRaw}::");
    }
    return $result;
}

# +-------------------
# | Given a block of text and
# | optionally one or more hash fields,
# | do all the supported expansions, including
# | file- and table- inclusions.
# |
sub doIncludes($$;@)
{
    my $fileName = shift;
    my $sourceText = shift;
    my @replacements = @_;
    my $resultText = "";

    while($sourceText =~ /^(.*?)::include ([^\n\r]+?)::(.*)$/s)
    {
        $resultText .= $1;
        my $includeFileName = $2;
        $sourceText = $3;

dPrint("doIncludes, includeFileName=\"$includeFileName\"");
        $includeFileName = makePathRelative($fileName,$includeFileName);
        my $includeFileContents = readFile($includeFileName);
        $includeFileContents = processFile($includeFileName,@replacements);

        $resultText .= $includeFileContents;
    }
    # add on the tail...
    $resultText .= $sourceText;
dPrint("doIncludes: leaving with $resultText");
    return $resultText;
}

sub makePathRelative($$)
{
    my ($ownerPath,$includePath) = (@_);

    # if ownerpath is, say foo/bar.hpp, and includepath is baz.hpp,
    # then result is foo/baz.hpp.
    #
    # if ownerpath is foo/bar.hpp, and includepath is /baz.hpp,
    # result is /baz.hpp.

    if($includePath =~ /^\/.*$/)
    {
        return $includePath;
    }

    my $prefix = "";
    if($ownerPath =~ /^(.*\/).*?$/)
    {
        $prefix = $1;
    }

    my $result = $prefix . $includePath;
    return $result;
}


# +------------------
# | given a file name and optional stack
# | of replacements, process it.
# | decide whether it is flat or a roster
# |
sub processFile($;@)
{
    my $fileName = shift;
    my @replacements = @_;
    my $result = "";

    dPrint("processFile: $fileName");

    my $fileContents = readFile($fileName);

    # allow \: to be a non-interpreted colon
    $fileContents =~ s/\\:/$COLON_HOLDER/gs;
    $fileContents =~ s/\\\[/$BRACE_HOLDER/gs;
    $fileContents =~ s/\\\*/$STARR_HOLDER/gs;

    $fileContents = doMarkupShortcuts($fileName,$fileContents);

    if(length($fileContents) == 0)
    {
dPrint("processFile: empty or missing $fileName");
        $result = errorBox("empty or missing file: \"$fileName\"");
dPrint("processFile: result is $result");
    }
    elsif($fileName =~ /^.*\.hpp_roster$/)
    {
        $result = processRosterFile($fileName,$fileContents,@replacements);
    }
    else
    {
        $result = processFlatFile($fileName,$fileContents,@replacements);
    }

    # restore escaped characters, sans escape
	$result =~ s/$COLON_HOLDER/:/gs;
	$result =~ s/$BRACE_HOLDER/[/gs;
	$result =~ s/$STARR_HOLDER/*/gs;

    return $result;
}


# +-----------------
# | given a filename and an optional stack of
# | replacement hashes, read it in, perform
# | all substitutions and directives, and
# | return the result
# |

sub processFlatFile($$;@)
{
    my $fileName = shift;
    my $fileContents = shift;
    my @replacements = @_;

    my $result = doIncludes($fileName,$fileContents,@replacements);
    return $result;
}

sub processRosterFile($$;@)
{
    my $fileName = shift;
    my $fileContents = shift;
    my @replacements = @_;

    # 
    # The file is in four parts
    #   1. template for top
    #   2. template for repeated middle
    #   3. template for end
    #   4. roster elements
    #
    # split on =========='s.
    #
    my @fileParts = split(/${cr}={5,}${cr}/,$fileContents);
    printArray("file parts: ",@fileParts);
    my @rosterSections = processRoster($fileParts[3]);

    my $result = "";

    $result .= $fileParts[0]; # TOP of output

    # repeat for each roster-row

    foreach my $rosterSectionFields (@rosterSections)
    {
        my $html = $fileParts[1];
        # html is now in tempate form...
	
        foreach my $field (sort(keys(%$rosterSectionFields)))
        {
            my $value = $$rosterSectionFields{$field};
            my @valueSplit = split(/(${cr}){2,}/,$value);

            $value = "";
            for(my $ii = 0; $ii <= $#valueSplit; $ii++)
            {
                my $v = $valueSplit[$ii];
                next if $v =~ /^\s*$/;
                my $pp = $ii == $#valueSplit ? "" : "<p></p>"; #cr-cr = small gap?
                $pp = ""; # DISABLE the double-line p-breaking.
                $value .= "$v$pp\n";
            }
            # trim leading & trailing whitespace on value, for subs
            if($value =~ /^\s*(.*?)\s*$/)
            {
                $value = $1;
            }

        $html =~ s/::${field}::/$value/gs;

	    # remove conditionals for this field:
	    $html =~ s/\?\?${field}\?\?//gs;
        }

        # 
        # Clean up any stray unfound substitutions
        #
        $html =~ s/::[a-z-]+:://gs;

        #
        # Do the standard markup shortcuts, like 
        # *bold* and [[URL linktext]]
        #
        $html = doMarkupShortcuts($fileName,$html);
        #
        # Remove unsatisfied conditionals
        #
        $html =~ s/\?\?[a-z-]+\?\?.*?[\n\r]//gs;

        # and keep it.
        $result .= $html;
    }

    $result .= $fileParts[2]; # BOTTOM of output
    return $result;
}


# +----------------------------
# | Perform handy standard replacements
# | that ARE useful to html tags
sub doStandardReplacements($$)
{
    my ($fileName,$input) = (@_);
    my $src = $input;

    # ::thisdir:: becomes directory we're in, for relative
    my $thisDir = ".";
    if($fileName =~ /^(.*)\/.*?$/gs)
    {
        $thisDir = $1;
    }

    # ::up filename:: becomes filename or ../filename or ../../filename ...
    my $result = "";
    while($src =~ /^(.*?)::([^\n\r]*?)::(.*)$/gs)
    {
        my $textBefore = $1;
        my $aDirective = $2;
        my $textAfter = $3;
        dPrint("doStandardReplacements ::"."$aDirective"."::");
        # for directives with argument...
        my $directiveWord = "";
        my $directiveArg = "";
        if($aDirective =~ /^(.*?) (.*)$/)
        {
            $directiveWord = $1;
            $directiveArg = $2;
        }
            
        if($directiveWord eq "up")
        {
            #
            # find a file here or up from here,
            # expressed relative to omhpp.cgi
            #
            my $aFile = $directiveArg;
            my $upMax = 6;
            while((!-f "$thisDir/$aFile") && ($upMax-- > 0))
            {
                $aFile = "../$aFile";
                dPrint("up $directiveArg -- checking $thisDir/$aFile");
            }
            $aFile = "missing/$directiveArg" if !-f "$thisDir/$aFile";
            $result .= "$textBefore$thisDir/$aFile";
            $src = $textAfter;
        }
        elsif ($directiveWord eq "upr")
        {
            #
            # find a file here or up from here,
            # expressed relative to the hpp file
            # (same algorithm as "up", but
            # replace without prefix to hpp-containing dir
            #
            my $aFile = $directiveArg;
            my $upMax = 6;
            while((!-f "$thisDir/$aFile") && ($upMax-- > 0))
            {
                $aFile = "../$aFile";
            }
            $aFile = "missingr/$directiveArg" if !-f "$thisDir/$aFile";
            $result .= "$textBefore$aFile";
            $src = $textAfter;
        }
        elsif ($aDirective eq "thisdir")
        {
            $result .= "$textBefore$thisDir";
            $src = $textAfter;
        } else
        {
            $result .= $textBefore . "::" . $aDirective . "::";
            $src = $textAfter;
        }
        dPrint("doStandardReplacements result == $result");
    }
    $result .= $src;

    return $result;
}


# +----------------------------
# | Perform any handy quote-fixing, *bolding*, and so on.
# | these fixups should NOT be done on html tags
sub doMarkupShortcuts($$)
{
    my ($fileName,$input) = (@_);
    my $src = $input;

    $src = doStandardReplacements($fileName,$src);

    my $srcHtmlized = "";
    while($src =~ /^(.*?)\[\[([^ ]+?)\s(.+?)\]\](.*)$/s)
    {
dPrint("htmlizing src=$src");
        my $textBefore = $1;
        my $url = $2;
        my $urlText = $3;
        $src = $4;

        # IF its an hpp reference, handle specially
        # to retain links
        # TODO: flag to change it into html-ref, if using
        # as a build-time preprocessor
        # leading backslash means, treat hpp literally, dont cgi it
        my $okToEscape = 1;
        if($url =~ /^\\(.*)$/)
        {
            $url = $1;
            $okToEscape = 0;
        }

        $url = makePathRelative($fileName,$url);
        if($okToEscape && $url =~ /(.*)\.hpp/)
        {
            $url = "$TOOL?src=$url";
        }
        $srcHtmlized .= $textBefore . "<a href=\"$url\">$urlText</a>";
    }
    $srcHtmlized .= $src;
    $src = $srcHtmlized;
    $srcHtmlized = ""; # done.

    ###$src =~ s/\[\[([^ ]+?)\s(.+?)\]\]/<a href="http:$1">$2<\/a>/gs;

    # start by protecting things inside of <markup elements>,
    # so they don't get formatted
    my @chunks;
    while($src =~ /^(.*?)(<.*?>)(.*)$/s)
    {
        my $openText = $1;
        my $markupElement = $2;
        $src = $3;
dPrint "openText:$openText";
dPrint "markupElement:$markupElement";
        push(@chunks,$openText);
        push(@chunks,$markupElement);
    }
dPrint "src:$src";
    push(@chunks,$src);

    my $result = "";
    # now do each chunk, leaving markup elements intact
    foreach my $value (@chunks)
    {
dPrint "chunks: value = $value";
        if($value =~ /^<.*>$/)
        {
            # markup chunk? just append it...
            $result .= $value;
            next;
        }
dPrint "a: $value";
        #
        # Bold things in two *asterisks* on same line
        # italicized things in _two underscores_ on the same line
        #
        $value =~ s/\*([^\n]+?)\*/<b>$1<\/b>/gs;
        $value =~ s/ \_([^\n]+?)\_ / <i>$1<\/i> /gs;

dPrint "b: $value";
        #
        # General quoting (what about attributes, oops)
        #

        # 2007.03.06 dvb I decided: I dont like quoting
        # it interferes too much with stylesheets, javascript, &c.
        #$value =~ s/(\s)\"/$1&ldquo;/gs;
        #$value =~ s/\"/&rdquo;/gs;
        #$value =~ s/([a-zA-Z])\'s/$1&rsquo;s/gs;

        # special chars, too

        #$value =~ s/\xd5/&rsquo;/gs;

        $result .= $value;

    }

dPrint("doMarkupShortcuts $input returns $result");
    return $result;
}

my $passCount = 0;
my $failCount = 0;
sub assertEq($$$)
{
    my($msg,$v1,$v2) = (@_);
    if(!($v1 eq $v2))
    {
        print("ERROR mismatch $msg: \nwant $v1\ngot  $v2\n");
        $failCount++;
    }
    else
    {
        $passCount++;
    }
}

sub assertRelativePath($$$)
{
    my ($path,$includePath,$expectedResultPath) = (@_);
    my $result = makePathRelative($path,$includePath);
    assertEq("relative path expected $expectedResultPath got $result",$expectedResultPath,$result);
}

# miscellaneous runtime tests, invoke with "TEST" as the argument
sub tests()
{
    $gDebugBlab = 1;
    my $value = "this *is* a test";
    my $result = doMarkupShortcuts("",$value);
    assertEq("bold asterisks","this <b>is</b> a test",$result);

    $result = doMarkupShortcuts("","what _about_ it?");
    assertEq("underlines ","what <i>about</i> it?",$result);

    $result = doMarkupShortcuts("","<div> my link: [[http://omino.com omino]].</div>");
    assertEq("href ","<div> my link: <a href=\"http://omino.com\">omino</a>.</div>",$result);

    $result = doMarkupShortcuts("","<div class=\"foo\"> my link: [[http://omino.com omino]].</div>");
    assertEq("href & attributes ","<div class=\"foo\"> my link: <a href=\"http://omino.com\">omino</a>.</div>",$result);

    $result = doMarkupShortcuts("","[[foo.hpp go]]");
    assertEq("hpp href","<a href=\"$TOOL?src=foo.hpp\">go</a>",$result);

    $result = doMarkupShortcuts("here/doc.hpp","[[deeper/foo.hpp go]]");
    assertEq("hpp href","<a href=\"$TOOL?src=here/deeper/foo.hpp\">go</a>",$result);

    # check use of leading \ on url to prevent cgi-ifying
    $result = doMarkupShortcuts("here/doc.hpp","[[\\deeper/foo.hpp go]]");
    assertEq("escaped hpp href","<a href=\"here/deeper/foo.hpp\">go</a>",$result);

    # check ::up foo::
    $result = doMarkupShortcuts("here/doc.hpp","blah blah <img src=\"::up foo::\">");
    assertEq("::up foo:: failed","blah blah <img src=\"missing/foo\">",$result);

    # check ::upr foo::
    $result = doMarkupShortcuts("here/doc.hpp","blah blah <img src=\"::upr foo::\">");
    assertEq("::upr foo:: failed","blah blah <img src=\"missingr/foo\">",$result);

    # check ::thisdir::
    $result = doMarkupShortcuts("dir1/dir2/foo.hpp","x::thisdir::y");
    assertEq("::thisdir:: failed","xdir1/dir2y",$result);

    # check ::thisdir:: and safety
    $result = doMarkupShortcuts("dir1/dir2/foo.hpp","::include x:: x::thisdir::y");
    assertEq("::thisdir:: destroyed failed","::include x:: xdir1/dir2y",$result);

    # works for multiple lines?
    $value = <<EOP
ok <div
class="fish">
  what about *this* ?
</div>
out
EOP
;

   my $expected = <<EOP
ok <div
class="fish">
  what about <b>this</b> ?
</div>
out
EOP
;
   $result = doMarkupShortcuts("",$value);
   assertEq("line breaks",$expected,$result);


# relative path tests
    assertRelativePath("here.hpp","subdir/there.hpp","subdir/there.hpp");
    assertRelativePath("subdir/here.hpp","subdir2/there.hpp","subdir/subdir2/there.hpp");
    assertRelativePath("subdir/here.hpp","/subdir2/there.hpp","/subdir2/there.hpp");

    # REPORT the results.
    print "$passCount tests pass\n";
    print "$failCount tests fail\n";
}

sub main(@)
{
    $TOOL = $0; # who am i?
    $TOOL = $1 if($TOOL =~ /^.*\/(.*)?$/); # just the name, url is relative to here.

    if( $_[0] eq "TEST")
    {
        tests();
        exit(0);
    }
    print "content-type: text/html\n\n" if $gIsCGI;
    fetchArguments();
    my $sourceFileName = getArg("src","index.hpp");

    if($sourceFileName =~ /^.*\/$/) # ends with slash?
    {
    	$sourceFileName .= "index.hpp";
    }

    my $result = processFile($sourceFileName);
    print $result;

    print "\n";
}

main(@ARGV);

#eof