#!/usr/local/bin/perl
#xtraclnk.pl: Extracts hypertext links from HTML files; isolates text contained
#             in <A> and <TITLE> elements.
#
# Typical use:
#
#   perl xtraclnk.pl [options] infiles.html > outfile
#
# Where options have the form "option=value", as discussed below (command line
# options other than ``title='' and ``loc='' work the same way as those of the
# htmlchek program in this distribution).
#
# Whenever xtraclnk.pl encounters an <A HREF="URL">Text</A> link in an input
# file, it copies this to the output.  Whenever xtraclnk.pl encounters an
# <A NAME="name">Text</A> anchor in an input file, it copies this as an
# <A HREF="currentfile.html#name">Text</A> link _to_ the current input file.
# Finally, the contents of a <TITLE>Text</TITLE> element are copied as an
# <A HREF="currentfile.html">Text</A> link _to_ the current input file.
# Each link in the ouput occupies exactly one line.
#
# This program was suggested by an idea of John Harper; what he had in mind,
# I think, was to use this as part of a CGI script which would dynamically
# construct an HTML document with links to all files with a title or anchors
# that contain text matching a user-specified search pattern.  However,
# xtraclnk.pl also has some value as an HTML style debugging tool: if you have
# used a lot of context-dependent titles like "Intro" and meaningless anchor
# text like "Click Here", this will be very apparent when you view the HTML
# document (derived with xtraclnk.pl using the ``title='' option) which
# contains only the text inside titles and anchors in your other HTML
# documents.  This program can also be used to enforce consistency in link text:
# if there is random variation between different <A HREF="...">LinkText</A>
# elements which all point towards the same resource, this will be apparent
# when the output of xtraclnk.pl is sorted.  Also, by looking over the sorted
# output of <tt>xtraclnk.pl</tt>, it becomes relatively easy to detect mistaken
# links, that point to someplace other than what was intended.
#
#  If you apply xtraclnk.pl to a list of filenames that are all specified
# relative to the current directory then all the references to files in
# subordinate directories will be expressed from the point of view of the top
# directory (i.e. relative URL pathnames will have the current directory as
# starting point).  Under Unix, you can use:
#
#   perl xtraclnk.pl `find . -name \*.html -print` > output
#
# Since xtraclnk.pl is a hacked-down version of the htmlchek error checker, it
# is rather robust in its handling of incorrect HTML code (but it generally has
# the same limitations that htmlchek does with metachar=2).  Though it is not
# a general-purpose error checker like htmlchek, xtraclnk.pl does return
# errormessages about HTML errors connected with its functioning (note that it
# ignores all tags in  a file except <A>, <BASE>, <TITLE>, and the
# ALT="..." attribute valuex of <IMG>).
#
# Command-line
#   options:
#
#  dirprefix=...     A string to be prefixed to URL's in the output links, in
#                 order to resolve relative URL's into absolute URL's.
#                 (See the htmlchek documentation for the complexities of use.)
#
#  usebase=1         Take the prefix from a <BASE HREF="..."> tag in each file.
#
#  sugar=1           Use the Unix ``filename: linenumber:'' format in reporting
#                 errors.
#
#  title=...         Make the output file a valid HTML document, with <br> at
#                 the end of each line, and a title as specified.  Error
#                 messages (if any) appear as HTML comments in the outputfile.
#                 (If this title= option is not specified on the command line,
#                 the output will tailored for human readability, and will not
#                 really be an HTML file.)
#                    Note that the output with title= will still be a HTML file
#                 if you run it though the ``sort'' and ``uniq'' filters.  It
#                 will also remain HTML if you run it through ``grep'' -- as
#                 long as you keep the first and last lines; for example (under
#                 Unix):
#                         perl xtraclnk.pl title="Link Stuff" *.html > out
#                         head -1 out > linkfile.html
#                         egrep 'pattern' out >> linkfile.html
#                         tail -1 out >> linkfile.html
#
#  loc=...           Whether or not to include the location (input filename and
#                 linenumber) from which each output link is derived.  If you
#                 don't include locations, it's hard to tell where bad links
#                 came from; if you do include locations, the output will be
#                 larger, and running the output though sort and uniq won't be
#                 as useful for detecting inconsistent link text.
#                    By default, source locations are not included in the
#                 output. A value of loc=1 causes locations to be included.
#                 A value of loc=hide (or anything beginning with the three
#                 characters "hid...") will include locations as HTML comments,
#                 if the title= option has alson been specified.
#
# Copyright 1994, 1995 by H. Churchyard, churchh@uts.cc.utexas.edu -- freely
# redistributable.
#
#  Version 1.0 12/15/94
#  Version 1.1 12/18/94 -- improve HTML-icity of "title=" option output, etc.
#  Version 1.11 12/19/94 -- squashed minor bugs.   Was informally made
# available by HTTP from uts.cc.utexas.edu.
#  Version 1.2 1/9/95 -- Added loc= option, include <IMG ALT="..."> text in
# links.  Included in htmlchek 4.0 release.
#
eval "exec /usr/local/bin/perl -S $0 $*"
    if $running_under_some_shell; # This emulates #! processing on NIH machines
#
# Setup:
#
$known{'A'} = 1; $known{'IMG'} = 1; $known{'TITLE'} = 1; $known{'/A'} = 1;
$known{'/TITLE'} = 1; $known{'BASE'} = 1; $pair{'A'} = 1;  $pair{'TITLE'} = 1;
#
&initscalrs();
$usebase = 0; $dirprefix = ''; $sugar = 0; $title = ''; $loc = 0;
#process any FOO=bar switches
eval '$'.$1.'$2;' while $ARGV[0] =~ /^(usebase=|dirprefix=|sugar=|title=|loc=)(.*)/ && shift;
$[ = 1;                 # set array base to 1
$, = ' ';               # set output field separator
$\ = "\n";              # set output record separator
foreach $X (@ARGV) {
    if ($X =~ /^[^=]+=/) {
        print STDERR "Apparent misspelled or badly-placed command-line option $&";
        print STDERR "Attempting to continue anyway...";}}
#
if ($title) {
    print " <html><head><title>$title</title></head><body><h1>$title</h1>";
    $E = '<br>'; $gt = '&gt;'; $lt = '&lt;'; $A = '<!-- '; $Z = ' -->';
    if ($loc =~ /^HID/i)
      {$AA = '<!-- '; $ZZ = ' -->';}
    else
      {$AA = ''; $ZZ = '';}}
else {
    $E = ''; $gt = '>'; $lt = '<'; $A = ''; $Z = ''; $AA = ''; $ZZ = '';}
#
# Main
#
$stuperlRS = $/;
while (<>) {
    if ($_ =~ /$stuperlRS$/o) { # strip record separator, allow for last line to
        chop;}                  # be unterminated.
    if (($.-$FNRbase) == 1) {
        $fn = $ARGV;
        # Next line is Unix-specific
        $fn =~ s/^\.\///;
        $nampref = ($dirprefix . $fn . '#');
        $lochpref = ($dirprefix . $fn);
        if ($fn =~ /.\//) {
            $fromroot = $fn; $fromroot =~ s/\/[^\057]*$/\//;}
        else {
            $fromroot = '';}
        $fromroot=($dirprefix . $fromroot);}
    if ($sugar) {$S = ($fn . ': ' . ($.-$FNRbase) . ': ');}
    if ($loc) {$L = ($fn . ' ' . ($.-$FNRbase));}
    $lastbeg = 0; $currsrch = 1; $txtbeg = 1;
    while ((((substr($_, $currsrch) =~ /[<>]/) eq 1) &&
      ($RSTART = length($`)+1)) != 0) {
        $currsrch = ($currsrch + $RSTART);
        if (substr($_, ($currsrch - 1), 1) eq '<') {
            if ($state) {
                print $A . $S . "Multiple `$lt' without `$gt' ERROR!", &crl() .
                  $Z;}
            else {
                if (($currsrch > length($_)) ||
                  (substr($_, $currsrch, 1) =~ /^[ \t]$/)) {
                    print $A . $S .
                      "Whitespace after `$lt': Incorrect SGML syntax ERROR!",
                      &crl() . ", Ignoring$Z";}
                else {
                    if (($nestvar) && ($currsrch > ($txtbeg + 1))) {
                        $line = ($line . substr($_, $txtbeg,
                          ($currsrch - ($txtbeg + 1))));}
                    $lastbeg = $currsrch; $state = 1;
                    $lasttag = ''; $lastopt = '';}}}
        else {
            if (substr($_, ($currsrch - 1), 1) eq '>') {
                if ($state == 0) {
                    next;}        #`>' without `<'
                else {
                    &parsetag($currsrch - 1);
                    if (($inquote) || ($inequal)) {
                        &malft();}
                    if ($optfree) {
                        &misstest();}
                    if (($lasttag eq 'A') && (!$wasname) && (!$washref)) {
                        print $A . $S . $lt .
                          "A$gt tag occurred without reference (NAME,HREF,ID) option ERROR!",
                          &crl() . $Z;}
                    if (($wasname > 1) || ($washref > 1)) {
                        print $A . $S .
                          'Multiple reference (NAME,ID;HREF) options ERROR!',
                          &crl(), 'on tag', $lasttag . $Z;}
                    $txtbeg = $currsrch;
                    $state = 0; $continuation = 0;}}
            else {
                print $A . $S . 'Internal error', &crl(), 'ignore' . $Z;}}}
    if (($state == 1) || (($lastbeg == 0) && ($continuation == 1))) {
        &parsetag(length($_) + 1);
        $continuation = 1;}
    else {
        if (($nestvar) && (!$state) && ($txtbeg <= length($_))) {
          $line = ($line . substr($_, $txtbeg) . ' ');}
        else {
          $line = ($line . ' ');}}}
continue {
    $FNRbase = $. if eof;}
#
# End-of-file routine.
#
if ($. > 0) {&endit()};
if ($title) {print '<hr></body></html>';}
#
#
# parsetag() communicates with main() through these global variables:
# - $lastbeg (zero if no `<' ocurred on line, otherwise points to character
#   immediately after the last `<' encountered).
# - $state (one if unresolved `<', zero otherwise).
# - $continuation (one if unresolved `<' from previous line, zero otherwise),
# - $inquote (one if inside option quotes <tag opt="...">).
#
sub parsetag {
    local($inp) = @_;
    if (!$lastbeg) {
        $lastbeg = 1;}
    $numf = (@arr = split(' ', substr($_, $lastbeg, ($inp - $lastbeg))));
    if ($numf == 0) {
        if (!$continuation) {
            print $A . $S . "Blank $lt$gt ERROR!", &crl() . $Z;
            $state = 0;}
        return;}
    else {
        if (!$continuation) {
            $arr[1] =~ tr/a-z/A-Z/;
            $lasttag = $arr[1];
            if (defined $known{$arr[1]}) {
                if ($arr[1] =~ /^\//) {
                    # </TAG> found
                    $arr[1] =~ s/^\///;
                    if (defined $pair{$arr[1]}) {
                            if (($nestvar <= 0) || ($lev{$arr[1]} <= 0)) {
                                print $A . $S . 'Extraneous /' . $arr[1],
                                  'tag without preceding', $arr[1], 'tag ERROR!',
                                  &crl() . ', Ignoring' . $Z;}
                            else {--$nestvar; --$lev{$arr[1]};
                                  if ($arr[1] eq 'TITLE') {&doout($lochpref);}
                                  else
                                      {if ($currf[2]) {&doout($currf[2]);}
                                       if ($currf[3]) {&doout($currf[3]);}}}}}
                else {
                    # <TAG> found
                    if ($arr[1] ne 'IMG') {$line = '';}
                    ++$lev{$arr[1]};
                    if (defined $pair{$arr[1]}) {
                        $currf[2] = ''; $currf[3] = '';
                        ++$nestvar;
                        if (($lev{$arr[1]} > 1) || ($nestvar > 1)) {
                            print $A . $S . 'Nesting ERROR!', &crl(),
                            "on tag $arr[1]" . $Z;}}}}
                $startf = 2; $inquote = 0; $inequal = 0; $optfree = 0;
                $wasopt = 0; $wasname = 0; $washref = 0;}
        else {
            $startf = 1;}
        # Remainder of stuff in <...> after tag word
        if (defined $known{$lasttag}) {
            for ($i = $startf; $i <= $numf; ++$i) {
                if ((!$inequal) && (!$inquote)) {
                    if (($arr[$i] =~
                      /^[^=\042]*(=\042[^\042]*\042)?$/) ||
                      ($arr[$i] =~ /^[^=\042]*=(\042)?[^\042]*$/)) {
                        if (($optfree) &&
                          (($arr[$i] =~ /^=[^=\042][^=\042]*$/) ||
                          ($arr[$i] =~ /^=\042[^\042]*\042$/))) {
                            if (!$malftag) {
                                $arr[$i] =~ s/^\075//;
                                if ($arr[$i] =~ /\042/) {
                                    &optvalproc($arr[$i],1);}
                                else {&optvalproc($arr[$i],0);}}
                            $optfree = 0; ++$tagwarn;}
                        else {
                            if (($optfree) && (($arr[$i] =~ /^=\042/) ||
                              ($arr[$i] eq '='))) {
                                $inequal = 1; ++$tagwarn;}
                            @arr2 = split(/=/, $arr[$i], 2);
                            if ($arr2[1] eq '') {
                                if (!$inequal) {
                                    print $A . $S . 'Null tag option ERROR!',
                                      &crl(), "on tag $lasttag" . $Z;
                                $malftag = 1;}}
                            else {
                                if ($optfree) {
                                    &misstest();}
                                $arr2[1] =~ tr/a-z/A-Z/;
                                $optfree = 1; ++$wasopt;
                                $malftag = 0; $optvalstr = '';
                                if ($lasttag =~ /^\//) {
                                    print $A . $S . 'Option on closing tag',
                                    $lasttag, 'Warning!', &crl() . $Z;}
                                else {
                                    $lastopt = $arr2[1];}}
                            if ($arr[$i] =~ /^[^=\042][^=\042]*=$/) {
                                $inequal = 1;}
                            if ($arr[$i] =~ /[\075]/) {
                                $optvalstr = $arr[$i];
                                $optvalstr =~ s/^[^=]*=//;}
                            $stuperltmp = $arr[$i];
                            $Q = ($stuperltmp =~ s/\042//g);
                            if ($Q == 1) {
                                $inquote = 1;}
                            if (($optvalstr)&&(!$inequal)&&(!$inquote)) {
                                $optfree = 0;
                                if (!$malftag) {
                                    &optvalproc($optvalstr,$Q);}}}}
                    else {
                        &malft();}}
                else {
                    if (($inequal) && (!$inquote)) {
                        ++$tagwarn;
                        if ($arr[$i] =~ /\042/) {
                            if ($arr[$i] =~ /^\042[^\042]*(\042)?$/) {
                                $stuperltmp = $arr[$i];
                                if (($stuperltmp =~ s/\042//g) == 2) {
                                    if (!$malftag) {
                                        $stuperltmp =~ s/^\075//;
                                        &optvalproc($stuperltmp,1);}
                                    $inequal = 0; $optfree = 0;}
                                else {
                                    $optvalstr = $arr[$i];
                                    $inquote = 1;}}
                            else {
                                &malft();}}
                        else {
                            if ($arr[$i] !~ /[\075]/) {
                                if (!$malftag) {
                                    &optvalproc($arr[$i],0);}
                                $inequal = 0; $optfree = 0;}
                            else {
                                &malft();}}}
                    else {
                        if ($arr[$i] =~ /\042/) {
                            $inquote = 0; $inequal = 0; $optfree = 0;
                            if ($arr[$i] !~ /^[^\042]*\042$/) {
                                &malft();}
                            else {
                                $optvalstr = ($optvalstr . ' ' . $arr[$i]);
                                if (!$malftag) {
                                  &optvalproc($optvalstr,1);}}}
                        else {
                            $optvalstr = ($optvalstr . ' ' . $arr[$i]);}}}}}
        return;}}
#
#
# Return as much location information as possible in diagnostics:
#
# Current location:
sub crl {
    if (($fn)&&($fn ne '-')) {
        return ('at line ' . ($.-$FNRbase) . " of file \042" . $fn . "\042");}
    else {
        return ('at line ' . $.);}}
#
# End of file location:
sub ndl {
    if (($fn)&&($fn ne '-')) {
        return ("at END of file \042" . $fn . "\042");}
    else {
        return 'at END';}}
#
# Error message returned from numerous places in the program...
#
sub malft {
    print $A . $S . 'Malformed tag option ERROR!', &crl(), 'on tag', $lasttag .
      $Z;
    $malftag = 1;}
#
#
#Check for non-kosher null options:
#
sub misstest {
    if ((($lasttag eq 'A') && ($lastopt eq 'NAME')) || ($lastopt eq 'HREF') ||
      ($lastopt eq 'ID')) {
        print $A . $S . 'Missing reference option value', &crl(),
          "on tag $lasttag, option $lastopt" . $Z;}}
#
#
sub doout {
    local($href) = @_;
    $line =~ s/[ \t][ \t]+/ /g; $line =~ s/\t/ /g;
    $line =~ s/^ //; $line =~ s/ $//;
    if ($line eq '') {
      $line = '[ EMPTY ANCHOR TEXT ]';}
    print "<A HREF=\042" . $href . "\042>" . $line . '</A>', $AA . $L . $ZZ . $E;}
#
# This subroutine receives the raw option value string, for every tag option
# that does have a value.  It does some errorchecking and cleanup, and sets
# the URL or name of the current anchor.
#
sub optvalproc {
    local($val, $quoted) = @_;
    $currfn = 0;
    if ($quoted) {
        $val =~ s/\042//g; $val =~ s/^ //; $val =~ s/ $//;}
    if ($lasttag eq 'IMG') {
        if (($lastopt eq 'ALT') && ($val =~ /[^ \t]/)) {
            $line = ($line . " [ $val ] ");}}
    elsif ($lasttag eq 'BASE') {
        if (($usebase) && ($lastopt eq 'HREF')) {
            if (($quoted) && ($val) && ($val ne '=') && ($val !~ /[^ ] [^ ]/)) {
                $nampref = ($val . '#'); $lochpref = $val;
                if ($val =~ /.\//) {
                    $fromroot = $val;
                    $fromroot =~ s/\/[^\057]*$/\//;}
                else {
                    $fromroot = '';}}
            else {
               print $A . $S . "Bad $lt" . "BASE HREF=\042...\042$gt", &crl() .
                 ', Ignoring' . $Z;}}}
    else {
        if ((($lasttag eq 'A') && ($lastopt eq 'NAME')) || ($lastopt eq 'ID')) {
            $currfn = 2; ++$wasname;
            if ($val =~ /^#/) {
                print $A . $S . "Invalid #-initial location \042" .
                  $val . "\042 ERROR!", &crl(), 'on tag', $lasttag,
                  'option', $lastopt . $Z;}}
        else {
            if ($lastopt eq 'HREF') {
                $currfn = 3; ++$washref;}}}
    if ($currfn) {
        if (!$quoted) {
            print $A . $S . 'Unquoted reference option value Warning!', &crl(),
              "on tag $lasttag, option $lastopt$Z";}
        if ($val =~ /[^ ] [^ ]/) {
            print $A . $S . 'Whitespace in reference option value Warning!',
              &crl(), "on tag $lasttag, option $lastopt$Z";}
        else {
            if ($val eq '') {
                print $A . $S . 'Null reference option value ERROR!', &crl(),
                  "on tag $lasttag, option $lastopt$Z";}
            else {
                # Skip the residue of Malformed Tag Option cases;  OK to do
                # this, since "=" is not a valid URL;  However, a minor bug
                # is that <A NAME="="> will not be checked, and will not
                # result in any errormessage.
                if ($val ne '=') {
                    if ($currfn == 2) {
                        $val = ($nampref . $val);}
                    else {
                        if (($currfn == 3) && ($val =~ /^#/)) {
                            $val = ($lochpref . $val);}
                        else {
                            if ($val =~ /^http:[^\057]*$/) {
                                $val =~ s/^http://;}
                            if (($val !~ /^[^\057]*:/) && ($val !~ /^\//)) {
                                if ($val =~ /^~/) {
                                    print $A . $S .
                                      "Relative URL beginning with '~' Warning!",
                                      &crl(),"on tag $lasttag option $lastopt$Z";}
                                else {
                                    $val = ($fromroot . $val);}}}}
                    # This monstrosity supports "../" in URL's:
                    while ($val =~ /\057[^\057]*[^\057]\057\.\.\057/) {
                        $val =~ s/\057[^\057]*[^\057]\057\.\.\057/\057/;}
                    if (($val =~ /[:\057]\.\.\057/) || ($val =~ /^\.\.\057/)) {
                        print $A . $S . "Unresolved \042../\042 in URL Warning!",
                          &crl(), "on tag $lasttag option $lastopt$Z";}
                    $currf[$currfn] = $val;}}}}}
#
#
# Start each file with a clean slate.
#
sub initscalrs {
    $state = 0; $continuation = 0; $nestvar = 0; $S = ''; $L = ''; $line = '';}
#
#
#
sub endit {
    if ($sugar) {$S = ($fn . ': END: ');}
    if ($continuation) {
        print $A . $S . "Was awaiting a `$gt' ERROR!", &ndl() . $Z;}
    foreach $X (sort(keys %pair)) {
        if ($lev{$X} > 0) {
            print $A . $S . "Pending unresolved $lt" .
              "x$gt without $lt/x$gt ERROR!", &ndl(), 'on tag', $X . $Z;}}
    #Reinitialize for next file
    &initscalrs();
    undef %lev;}
#-=-  -=-  -=-  -=-  -=-  -=-  -=-  -=-  -=-  -=-  -=-  -=-  -=-  -=-  -=-  -=-
##EOF
