Alwanza: Perl Script - HTML to Lower Case

#!C:/Perl/bin

use strict;

# script created and provided by Meryll Larkin 6/25/02
# last update: 6/29/02
# please give credit where credit is due
# please report bugs to Meryll Larkin at alwanza@yahoo.com

# This script converts all HTML tags & property names to lower case
# changes <br> to <br />, <p> to <br /><br /> and eliminates </p>

# Known failings: right now this script does not preserve white space
# such as new lines, tabs or multiple spaces within HTML tags.
# Instead, it converts them all to single spaces.

# ***** Modifyable constants: ***********
my $maxFilelength = 100000; # Adjust if </html> tag is not present on generated file
my $commentPreserve = 1;    # 0 = will lower case all comments
# WARNING: setting commentPreserve to 0 will also lower case embedded stylesheet tags
# ***** End Modifyable constants ***********

my @inputfiles;
my $inputfile;
my $outfile;

if (! @ARGV) {
   my $name = ucfirst($ENV{LOGNAME});
   print "\nHello $name,\n\n";
   print "This script will convert all HTML tags and tag attribute names to lower case.\n";
   print "You will be asked to provide the name of the HTML file for processing.\n";
   print "Do you wish to continue? (y or n): ";

   &query_continue;
   $inputfile = &get_filename;
   push (@inputfiles, $inputfile);
} else {
   @inputfiles = @ARGV;
}

for $inputfile (@inputfiles) {
   $outfile = &open_infile_make_outfile($inputfile);
   # read entire file into $html
   my $html;
   my $bytes_read = read(INPUT, $html, $maxFilelength);
   # if there are scripts in Web page, preserve them as is.
   # Normalize script tags for processing
   $html =~ s/<\s*SCRIPT/<script/ig; # convert all script tags to lower case
   $html =~ s/<\s*\/SCRIPT/<\/script/ig;

   my @chunk_1 = &break_html_into_script_chunks($html, $bytes_read);
   # we now have 2 types of chunks, script chunks and html chunks
   # the html chunks need to divided into comment and noncomment chunks
   # this is because comments can contain html tags and searching for
   # opening and closing angle brackets gets difficult when they
   # are embedded in comments
   my $chunk;
   my $before;
   my $comment;
   my $after;
   my @chunk_2;
   my @chunkettes;
   foreach $chunk(@chunk_1) { # further divide chunks containing comments
      if ($chunk =~ /^\s*<script/ ) {
         push (@chunk_2, $chunk);
         next;
      }
      # Normalize comment tags for processing
      $chunk =~ s/<\s*!\s*-*\s//g;
      if (($chunk !~ /)/;
            push (@chunkettes, $comment);
            $chunk = substr($chunk, length($comment), (length($chunk) - length($comment)));
            $_ = $chunk;
            ($after) = /^([\s\S]*?)<!--/;
            # we should get an "after" unless there are no more comments OR
            # if the next comment is immediately after this one.
            if ($after =~ /[\w\W]/) {
               $chunk = substr($chunk, length($after), (length($chunk) - length($after)));
               push (@chunkettes, $after);
            } else {
               next if $chunk =~ /\s*<!--/;
               $_ = $chunk;
               ($after) = /^([\s\S]*)$/;
               $chunk = '';
               push (@chunkettes, $after);
            }
            $after = $comment = '';
         } # close while ($chunk =~ /<!--/)
         foreach $chunk(@chunkettes) {
            push (@chunk_2, $chunk);
         }
         @chunkettes = ''; # reinitialize @chunkettes
      }# close if ($chunk =~ /<!--/)
   } #close foreach $chunk(@chunk_1)

   my @sections;
   my $section;
   my $tagFlag = 1; # indicator for "<"
   foreach $chunk(@chunk_2) {
      if ($chunk =~ /^<script/ ) {   # chunk consists of a script
         my $scriptOpen = $chunk;
         $scriptOpen =~ /^<script[^>]>/;
         $scriptOpen =~ s/language/language/i;
         $scriptOpen =~ s/type/type/i;
         $chunk =~ s/^<script[^>]>/$scriptOpen/;
         $chunk .= "\n";
         &print_to_outfile($chunk);
         next;
      }
      elsif ($chunk =~ /^<!--/ ) {   # chunk consists of a comment
         $chunk = lc($chunk) if $commentPreserve == 0;
         &print_to_outfile($chunk);
         next;
      } else {   # chunk is HTML and text, just text, just html, or text and HTML
         # begin modification of HTML
         if ($chunk !~ />/) { # this chunk contains only text
            &print_to_outfile($chunk);
            next;
         }
         if ($chunk !~ /^\s*</) { # this chunk begins with text
            $tagFlag = 0; # needed for first section of chunk only
         }
         @sections = split("<", $chunk);
         my $text;
         my $tag;
         foreach $section(@sections) {
            next if $section !~ /\w/;
            $section = "<" . $section if $tagFlag; # restore < to section
            ($text, $tag) = split("<", $section) if !$tagFlag;
            ($tag, $text) = split(">", $section) if $tagFlag;
            if ($tag =~ /\w/) {
               $tag = $tag . ">" if $tag !~ />\s*$/; # restore > to tag
               $tag = $tag . "<" if $tag !~ /^\s*</; # restore > to tag
            }
            # find out if tag contains properties
            if ($tag =~ /=/) {
               # tag does contain properties
               $section = &process_tag_with_properties($tag, $text, $tagFlag);
               &print_to_outfile($section);
            } else { # tag does not contain properties
               $tag = lc($tag);
               $tag=~ s/<br>/<br \/>/;
               $tag =~ s/<p>/<br \/><br \/>/;
               $tag =~ s/<\/p>//;
               $section = $tag . $text if $tagFlag;
               $section = $text . $tag if !$tagFlag;
               &print_to_outfile($section);
            }
            $tagFlag = 1; # reinitialize for next section
            $text = $tag = ''; # reinitialize for next section
         } # close foreach section
      } # close if chunk does not contain script
   } # close foreach chunk
   close INPUT;
   close OUTPUT;
   print "\n   $inputfile processed. \n   View new formatting in $outfile.\n";
}
print "\n   Job complete.\n";

exit 0;

#*******************
sub query_continue {
    my $continue = substr(<STDIN>, 0, 1);    #take first character only
    $continue =~ tr/a-z/A-Z/;    #transform to upper case
    if (( $continue ne "Y" ) && ( $continue ne "N" )) {
       print "$continue is not a valid answer.\n";
       $continue = "";
       print "Do you wish to continue? (y or n)\n";
       &query_continue;
    }
    if ( $continue eq "N" ) {
       print "Quitting per user request.\n";
       exit (0);
    }
}

#*******************
sub get_filename {
    print "\nWrite the name of the html file that needs its tags changed \n" .
          "to lower case: ";
    chomp(my $inputfile = <STDIN>);    #remove "enter"
    if ( $inputfile !~ /\w+\.htm/ ) {
       print "$inputfile is not a valid file name.\n";
       print "Do you wish to continue? (y or n)\n";
       &query_continue;
    }
    return $inputfile;
}

#*******************
sub open_infile_make_outfile {
    my $inputfile = shift;
    unless (open( INPUT, $inputfile )){
       print "Can't open $inputfile. Do you wish to rewrite? (y or n)\n";
       &query_continue;
       $inputfile = &get_filename;
       &open_infile_make_outfile($inputfile);
    }
    my ($name, $extension) = split ('.ht', $inputfile);
    $name = $name . "_x";
    my $outfile = $name . ".ht" . $extension;
    if ( -s $outfile ) {
       print "\n$outfile exists.\nPlease delete or rename file and then rerun script.\n";
       exit (0);
    } else {
       open( OUTPUT, ">>$outfile" ) or die "Can\'t open $outfile $!.";
       return $outfile;
   }
}

#*******************

sub break_html_into_script_chunks {
   my ($html, $bytes_read)= @_;
   my $afterScriptPosition = 0;   # initialize for while loop
   my $remainingHTML = $html;   # initialize for while loop
   my $scriptClosePosition = 0; # initialize for while loop
   my $scriptOpenPosition = 0; # initialize for while loop
   # chunks are strings that either contain no <script> or
   # consist of one entire block of <script> to </script>
   my @chunk_1;

   while ($scriptOpenPosition != -1) {
      $scriptOpenPosition = index($html, "<script", 0);
      $scriptClosePosition = index($html, "<\/script>", 0);
      if ($scriptOpenPosition > -1) {
          my $scriptLength = $scriptClosePosition - $scriptOpenPosition + 9;
          my $scriptChunk = substr($html, $scriptOpenPosition, $scriptLength);
          my $HTMLChunk = substr($html, 0, $scriptOpenPosition);

          # separate file into script portions and html portions
          push(@chunk_1, $HTMLChunk) unless $HTMLChunk !~ /\w/;
          push(@chunk_1, $scriptChunk) unless $scriptChunk !~ /\w/;
          $afterScriptPosition = $scriptClosePosition +9;
          $remainingHTML = substr($html, $afterScriptPosition, (length($html) - $afterScriptPosition));
          $html = $remainingHTML;
      } else {
         # if we get here, there are no more <script> tags
         push(@chunk_1, $remainingHTML) unless $remainingHTML !~ /\w/;
      }
   }
   return @chunk_1;
}

#*******************
sub print_to_outfile {
   my $section = shift;
   print OUTPUT "$section";
}

#*******************
sub process_tag_with_properties {
   my ($tag, $text, $tagFlag) = @_;
   # there are several possible configurations for property-value pairs
   # a=b | a="b" | a='b' | a="b, c, d" | a='b, c, d' | a = "b"
   # difficult example <font size = "+1" color=red face='Arial, lucida,
   #    Geneva'>
   # another example - note quotes: onmouseout="imgOff('img01')"
   # flow logic: we don't know how to identify the end of each
   # property-value pair, so until we can, all we know is that it
   # will be positioned before the next = or >
   # in addition there could be stand alone words such as NOWRAP or NOSHADE
   # that aren't the name of the tag
   $tag =~ s/=\s+/=/g; #remove any white space immediately after =
   $tag =~ s/\s+=/=/g; #remove any white space immediately before =
   my $startPosition = 0;   # initialize for until loop
   my $tagEnd = 0; # initialize for until loop
   my $endPair; # initialize for until loop
   my $property;
   my $value;
   my $pair;
   my @goodPairs;
   until ($tagEnd==1) {
      my $eqPosition = index($tag, "=", 0);
      # look for stand-alone characters which are either tag identifiers
      # or properties with no equal signs
      my $possStandAlone = substr($tag, $eqPosition, 0);
      if ($possStandAlone =~ /(\w+)\s+\w*/ ) {
         my $fixed1 = lc($1);
         $pair = $fixed1;
         $endPair = index($tag, " ", 0);
         goto PUSHPAIR;
      }
      my $nextEqPosition = index($tag, "=", ($eqPosition +2));
      my $tagEndPosition = index($tag, ">");
      if ($nextEqPosition < 0 ) {
         # there are no more equals in tag, make ">" the nextEqPosition
         $nextEqPosition = $tagEndPosition;
         $tagEnd = 1;
      }
      # test are there quotes between this eq position and before next one?
      my $dubQuotPosition = index($tag, '"', $eqPosition);
      my $singQuotPosition = index($tag, "'", $eqPosition);
      # remember example: onmouseout="imgOff('img01')"
      if (($dubQuotPosition > -1) && ($singQuotPosition > -1 )) {
         # we have both double and single quotes
         if ($dubQuotPosition > $singQuotPosition) {
            # the value is delimited by single quotes
            $endPair = index($tag, "'", ($singQuotPosition+1));
         } else {
            # the value is delimited by double quotes
            $endPair = index($tag, '"', ($dubQuotPosition+1));
         }
      }
      elsif (($singQuotPosition > $eqPosition) && ($singQuotPosition < $nextEqPosition)) {
         # the value is delimited by single quotes
         $endPair = index($tag, "'", ($singQuotPosition+1));
      }
      elsif (($dubQuotPosition > $eqPosition) && ($dubQuotPosition < $nextEqPosition)) {
         # the value is delimited by double quotes
         $endPair = index($tag, '"', ($dubQuotPosition+1));
      } else {
         # there are no quotes in and no spaces after value string
         # the first space after the = sign is the end of the pair
         # or the end of the tag, if last property
         $endPair = index($tag, " ", $eqPosition);
         my $closeTag = index($tag, ">", $eqPosition);
         if ($endPair < 0) {
            $endPair = $closeTag;
         }
      }
      # at this point we have isolated a property or property-value pair
      $value = substr($tag, $eqPosition, ($endPair-$eqPosition + 1));
      $property = substr($tag, 0, $eqPosition);
      $property = lc($property);
      # gather all good data into reassembed tag here
      $pair = $property . $value;
PUSHPAIR:
      $pair =~ s/^\s*//; # strip off leading and trailing spaces
      $pair =~ s/\s*$//;
      push(@goodPairs, $pair);
      $startPosition = $endPair + 1;
      $tag = substr($tag, $startPosition, $tagEndPosition);
      $property = $value = $pair = $endPair = 0;
   } # end until ($tagEnd)
   $tagEnd = 0;
   $tag = join(" ", @goodPairs);
   # reinitialize goodPairs
   @goodPairs = "";
   # restore > to the end of tag
   $tag = $tag . ">" if $tag !~ />$/;
   $tag = "<" .$tag if $tag !~ /^</;
   my $section;
   if ($text) {
      $section = $tag . $text if $tagFlag;
      $section = $text . $tag if !$tagFlag;
   } else {
      $section = $tag;
   }
   return $section;
}

#*********** END *********