#!/usr/local/bin/perl
#dehtml.pl: Removes all HTML tags from file, preliminary to spell check; common
#           ampersand "&entities;" are also resolved into single characters.
#
# Typical use:
#
#   perl dehtml.pl infile.html > outfile.txt
#
# This program processes all files on the command line to STDOUT; to process a
# number of files individually, use the iteration mechanism of your shell; for
# example:
#
#    for a in *.html ; do perl dehtml.pl $a > otherdir/$a ; done
#
# in Unix sh, or:
#
#    for %a in (*.htm) do call dehtml %a otherdir\%a
#
# in MS-DOS, where dehtml.bat is the following one-line batch file:
#
#    perl dehtml.pl %1 > %2
#
# Copyright H. Churchyard 1994 -- freely redistributable.
#
#  Version 1.0 11/27/94 -- Tested with 4.03[56] on SunOS and DEC Alpha OSF/1,
# and MacPerl 4.13.  Included in htmlchek 3.0 release.
#  Version 1.1 12/6/94 -- Fixed minor bug which could unpredictably cause a
# string such as "&amp;eacute;" to be reduced into a single character;
# added "&shy;".   Included in htmlchek 3.01 release.
#  Version 1.2 1/12/95 -- No error on `>' outside tag; minor bugfix.  Included
# in htmlchek 4.0 release.
#
#   This program is a port to perl of the original dehtml.awk (the port was
# fairly mechanical, so programming style and efficency may not be high).
#
eval "exec /usr/local/bin/perl -S $0 $*"
    if $running_under_some_shell;
                        # this emulates #! processing on NIH machines.
$[ = 1;                 # set array base to 1
$, = ' ';               # set output field separator
$\ = "\n";              # set output record separator
#
$amp{'&#32;'} = "\040"; $amp{'&nbsp;'}="\040";
$amp{'&#34;'} = "\042"; $amp{'&quot;'}="\042";
$amp{'&#60;'} = "\074"; $amp{'&lt;'}="\074"; $amp{'&#62;'} = "\076";
$amp{'&gt;'}="\076"; $amp{'&Agrave;'}="\300"; $amp{'&Aacute;'}="\301";
$amp{'&Acirc;'}="\302"; $amp{'&Atilde;'}="\303"; $amp{'&Auml;'}="\304";
$amp{'&Aring;'}="\305"; $amp{'&AElig;'}="\306"; $amp{'&Ccedil;'}="\307";
$amp{'&Egrave;'}="\310"; $amp{'&Eacute;'}="\311"; $amp{'&Ecirc;'}="\312";
$amp{'&Euml;'}="\313"; $amp{'&Igrave;'}="\314"; $amp{'&Iacute;'}="\315";
$amp{'&Icirc;'}="\316"; $amp{'&Iuml;'}="\317"; $amp{'&ETH;'}="\320";
$amp{'&Ntilde;'}="\321"; $amp{'&Ograve;'}="\322"; $amp{'&Oacute;'}="\323";
$amp{'&Ocirc;'}="\324"; $amp{'&Otilde;'}="\325"; $amp{'&Ouml;'}="\326";
$amp{'&Oslash;'}="\330"; $amp{'&Ugrave;'}="\331"; $amp{'&Uacute;'}="\332";
$amp{'&Ucirc;'}="\333"; $amp{'&Uuml;'}="\334"; $amp{'&Yacute;'}="\335";
$amp{'&THORN;'}="\336"; $amp{'&szlig;'}="\337"; $amp{'&agrave;'}="\340";
$amp{'&aacute;'}="\341"; $amp{'&acirc;'}="\342"; $amp{'&atilde;'}="\343";
$amp{'&auml;'}="\344"; $amp{'&aring;'}="\345"; $amp{'&aelig;'}="\346";
$amp{'&ccedil;'}="\347"; $amp{'&egrave;'}="\350"; $amp{'&eacute;'}="\351";
$amp{'&ecirc;'}="\352"; $amp{'&euml;'}="\353"; $amp{'&igrave;'}="\354";
$amp{'&iacute;'}="\355"; $amp{'&icirc;'}="\356"; $amp{'&iuml;'}="\357";
$amp{'&eth;'}="\360"; $amp{'&ntilde;'}="\361"; $amp{'&ograve;'}="\362";
$amp{'&oacute;'}="\363"; $amp{'&ocirc;'}="\364"; $amp{'&otilde;'}="\365";
$amp{'&ouml;'}="\366"; $amp{'&oslash;'}="\370"; $amp{'&ugrave;'}="\371";
$amp{'&uacute;'}="\372"; $amp{'&ucirc;'}="\373"; $amp{'&uuml;'}="\374";
$amp{'&yacute;'}="\375"; $amp{'&thorn;'}="\376"; $amp{'&yuml;'}="\377";
$amp{'&reg;'}="\256"; $amp{'&copy;'}="\251"; $amp{'&#163;'} = "\243";
$amp{'&shy;'}="-";
#
# Main
#
# Variable ``$state'' is one if unresolved `<', zero otherwise.
#
$stuperlRS = $/;
while (<>) {
    if ($_ =~ /$stuperlRS$/o) { # strip record separator, allow for last line to
        chop;}                  # be unterminated.
    $line = ''; $errstr = ''; $erra = 0; $errb = 0;
    $currsrch = 1; $txtbeg = 1;
    while ((((substr($_, $currsrch) =~ /[<>]/) eq 1) &&
      ($RSTART = length($`)+1)) != 0) {
        $currsrch = ($currsrch + $RSTART);
        if (substr($_, ($currsrch - 1), 1) eq '<') {
            if ($state) {
                if (!$erra) {
                    $errstr = ($errstr .
                      "&&^Multiple `<' without `>' ERROR!, Ignoring^&&\n");
                    $erra = 1;}}
            else {
                if (($currsrch > length($_)) ||
                  (substr($_, $currsrch, 1) =~ /^[ \t]$/)) {
                    if (!$errb) {
                        $errstr = ($errstr .
                          "&&^Whitespace after `<': Bad SGML syntax ERROR!, Ignoring^&&\n");
                        $errb = 1;}}
                else {
                    if ($currsrch > ($txtbeg + 1)) {
                        $line = ($line . substr($_, $txtbeg,
                          ($currsrch - ($txtbeg + 1))));}
                    $state = 1;}}}
        else {
            if (substr($_, ($currsrch - 1), 1) eq '>') {
                if ($state == 0) {
                    next;}
                else {$txtbeg = $currsrch; $state = 0;}}
            else {print 'Internal error, ignore';}}}
#At EOL:
    if ((!$state) && ($txtbeg <= length($_))) {
        $line = ($line . substr($_, $txtbeg));}
    if ($line =~ /&#?[-0-9a-zA-Z.]*;/) {
        foreach $X (keys %amp) {
            $s_ = $amp{$X}; $line =~ s/$X/$s_/g;
            if ($line !~ /&/) {
                last;}}
        $line =~ s/&(#38|amp);/&/g;}
    if (($line) || ((!$state) && ($_ =~ /^$/))) {
        if ((!$state) || ($errstr) || ($line =~ /[ \t]$/))
            {print $line;}
        else {printf "%s", $line;}}
    if ($errstr) {
        printf '%s', $errstr;}}
#
#Minor bug: &g<X>t; will translate to a `>' character!
#
#END routine:
#
if ($state) {
    print "&&^Was awaiting a `>' ERROR! at END^&&";}
##EOF
