#!C:/Perl/bin
use strict;
# script created and provided by Meryll Larkin 6/25/02
# last update: 6/29/02
# please give credit where credit is due
# please report bugs to Meryll Larkin at alwanza@yahoo.com
# This script converts all HTML tags & property names to lower case
# changes <br> to <br />, <p> to <br /><br /> and eliminates </p>
# Known failings: right now this script does not preserve white space
# such as new lines, tabs or multiple spaces within HTML tags.
# Instead, it converts them all to single spaces.
# ***** Modifyable constants: ***********
my $maxFilelength = 100000; # Adjust if </html> tag is not present on generated file
my $commentPreserve = 1; # 0 = will lower case all comments
# WARNING: setting commentPreserve to 0 will also lower case embedded stylesheet tags
# ***** End Modifyable constants ***********
my @inputfiles;
my $inputfile;
my $outfile;
if (! @ARGV) {
my $name = ucfirst($ENV{LOGNAME});
print "\nHello $name,\n\n";
print "This script will convert all HTML tags and tag attribute names to lower case.\n";
print "You will be asked to provide the name of the HTML file for processing.\n";
print "Do you wish to continue? (y or n): ";
&query_continue;
$inputfile = &get_filename;
push (@inputfiles, $inputfile);
} else {
@inputfiles = @ARGV;
}
for $inputfile (@inputfiles) {
$outfile = &open_infile_make_outfile($inputfile);
# read entire file into $html
my $html;
my $bytes_read = read(INPUT, $html, $maxFilelength);
# if there are scripts in Web page, preserve them as is.
# Normalize script tags for processing
$html =~ s/<\s*SCRIPT/<script/ig; # convert all script tags to lower case
$html =~ s/<\s*\/SCRIPT/<\/script/ig;
my @chunk_1 = &break_html_into_script_chunks($html, $bytes_read);
# we now have 2 types of chunks, script chunks and html chunks
# the html chunks need to divided into comment and noncomment chunks
# this is because comments can contain html tags and searching for
# opening and closing angle brackets gets difficult when they
# are embedded in comments
my $chunk;
my $before;
my $comment;
my $after;
my @chunk_2;
my @chunkettes;
foreach $chunk(@chunk_1) { # further divide chunks containing comments
if ($chunk =~ /^\s*<script/ ) {
push (@chunk_2, $chunk);
next;
}
# Normalize comment tags for processing
$chunk =~ s/<\s*!\s*-*\s/<!-- /g;
$chunk =~ s/\s*-*-\s*>/ -->/g;
if (($chunk !~ /<!--/ ) && ($chunk =~ /[\w\W]/ )) {
push (@chunk_2, $chunk);
next;
} else {
while ($chunk =~ /<!--/) { # this chunk contains at least one comment
$_ = $chunk;
($before) = /^([\s\S]*?)<!--/;
if ($before =~ /[\w\W]/) {
push (@chunkettes, $before) if $before =~ /[\w\W]/;
$chunk = substr($chunk, length($before), (length($chunk) - length($before)));
}
$_ = $chunk;
($comment) = /(<!--[\S\s]*?-->)/;
push (@chunkettes, $comment);
$chunk = substr($chunk, length($comment), (length($chunk) - length($comment)));
$_ = $chunk;
($after) = /^([\s\S]*?)<!--/;
# we should get an "after" unless there are no more comments OR
# if the next comment is immediately after this one.
if ($after =~ /[\w\W]/) {
$chunk = substr($chunk, length($after), (length($chunk) - length($after)));
push (@chunkettes, $after);
} else {
next if $chunk =~ /\s*<!--/;
$_ = $chunk;
($after) = /^([\s\S]*)$/;
$chunk = '';
push (@chunkettes, $after);
}
$after = $comment = '';
} # close while ($chunk =~ /<!--/)
foreach $chunk(@chunkettes) {
push (@chunk_2, $chunk);
}
@chunkettes = ''; # reinitialize @chunkettes
}# close if ($chunk =~ /<!--/)
} #close foreach $chunk(@chunk_1)
my @sections;
my $section;
my $tagFlag = 1; # indicator for "<"
foreach $chunk(@chunk_2) {
if ($chunk =~ /^<script/ ) { # chunk consists of a script
my $scriptOpen = $chunk;
$scriptOpen =~ /^<script[^>]>/;
$scriptOpen =~ s/language/language/i;
$scriptOpen =~ s/type/type/i;
$chunk =~ s/^<script[^>]>/$scriptOpen/;
$chunk .= "\n";
&print_to_outfile($chunk);
next;
}
elsif ($chunk =~ /^<!--/ ) { # chunk consists of a comment
$chunk = lc($chunk) if $commentPreserve == 0;
&print_to_outfile($chunk);
next;
} else { # chunk is HTML and text, just text, just html, or text and HTML
# begin modification of HTML
if ($chunk !~ />/) { # this chunk contains only text
&print_to_outfile($chunk);
next;
}
if ($chunk !~ /^\s*</) { # this chunk begins with text
$tagFlag = 0; # needed for first section of chunk only
}
@sections = split("<", $chunk);
my $text;
my $tag;
foreach $section(@sections) {
next if $section !~ /\w/;
$section = "<" . $section if $tagFlag; # restore < to section
($text, $tag) = split("<", $section) if !$tagFlag;
($tag, $text) = split(">", $section) if $tagFlag;
if ($tag =~ /\w/) {
$tag = $tag . ">" if $tag !~ />\s*$/; # restore > to tag
$tag = $tag . "<" if $tag !~ /^\s*</; # restore > to tag
}
# find out if tag contains properties
if ($tag =~ /=/) {
# tag does contain properties
$section = &process_tag_with_properties($tag, $text, $tagFlag);
&print_to_outfile($section);
} else { # tag does not contain properties
$tag = lc($tag);
$tag=~ s/<br>/<br \/>/;
$tag =~ s/<p>/<br \/><br \/>/;
$tag =~ s/<\/p>//;
$section = $tag . $text if $tagFlag;
$section = $text . $tag if !$tagFlag;
&print_to_outfile($section);
}
$tagFlag = 1; # reinitialize for next section
$text = $tag = ''; # reinitialize for next section
} # close foreach section
} # close if chunk does not contain script
} # close foreach chunk
close INPUT;
close OUTPUT;
print "\n $inputfile processed. \n View new formatting in $outfile.\n";
}
print "\n Job complete.\n";
exit 0;
#*******************
sub query_continue {
my $continue = substr(<STDIN>, 0, 1); #take first character only
$continue =~ tr/a-z/A-Z/; #transform to upper case
if (( $continue ne "Y" ) && ( $continue ne "N" )) {
print "$continue is not a valid answer.\n";
$continue = "";
print "Do you wish to continue? (y or n)\n";
&query_continue;
}
if ( $continue eq "N" ) {
print "Quitting per user request.\n";
exit (0);
}
}
#*******************
sub get_filename {
print "\nWrite the name of the html file that needs its tags changed \n" .
"to lower case: ";
chomp(my $inputfile = <STDIN>); #remove "enter"
if ( $inputfile !~ /\w+\.htm/ ) {
print "$inputfile is not a valid file name.\n";
print "Do you wish to continue? (y or n)\n";
&query_continue;
}
return $inputfile;
}
#*******************
sub open_infile_make_outfile {
my $inputfile = shift;
unless (open( INPUT, $inputfile )){
print "Can't open $inputfile. Do you wish to rewrite? (y or n)\n";
&query_continue;
$inputfile = &get_filename;
&open_infile_make_outfile($inputfile);
}
my ($name, $extension) = split ('.ht', $inputfile);
$name = $name . "_x";
my $outfile = $name . ".ht" . $extension;
if ( -s $outfile ) {
print "\n$outfile exists.\nPlease delete or rename file and then rerun script.\n";
exit (0);
} else {
open( OUTPUT, ">>$outfile" ) or die "Can\'t open $outfile $!.";
return $outfile;
}
}
#*******************
sub break_html_into_script_chunks {
my ($html, $bytes_read)= @_;
my $afterScriptPosition = 0; # initialize for while loop
my $remainingHTML = $html; # initialize for while loop
my $scriptClosePosition = 0; # initialize for while loop
my $scriptOpenPosition = 0; # initialize for while loop
# chunks are strings that either contain no <script> or
# consist of one entire block of <script> to </script>
my @chunk_1;
while ($scriptOpenPosition != -1) {
$scriptOpenPosition = index($html, "<script", 0);
$scriptClosePosition = index($html, "<\/script>", 0);
if ($scriptOpenPosition > -1) {
my $scriptLength = $scriptClosePosition - $scriptOpenPosition + 9;
my $scriptChunk = substr($html, $scriptOpenPosition, $scriptLength);
my $HTMLChunk = substr($html, 0, $scriptOpenPosition);
# separate file into script portions and html portions
push(@chunk_1, $HTMLChunk) unless $HTMLChunk !~ /\w/;
push(@chunk_1, $scriptChunk) unless $scriptChunk !~ /\w/;
$afterScriptPosition = $scriptClosePosition +9;
$remainingHTML = substr($html, $afterScriptPosition, (length($html) - $afterScriptPosition));
$html = $remainingHTML;
} else {
# if we get here, there are no more <script> tags
push(@chunk_1, $remainingHTML) unless $remainingHTML !~ /\w/;
}
}
return @chunk_1;
}
#*******************
sub print_to_outfile {
my $section = shift;
print OUTPUT "$section";
}
#*******************
sub process_tag_with_properties {
my ($tag, $text, $tagFlag) = @_;
# there are several possible configurations for property-value pairs
# a=b | a="b" | a='b' | a="b, c, d" | a='b, c, d' | a = "b"
# difficult example <font size = "+1" color=red face='Arial, lucida,
# Geneva'>
# another example - note quotes: onmouseout="imgOff('img01')"
# flow logic: we don't know how to identify the end of each
# property-value pair, so until we can, all we know is that it
# will be positioned before the next = or >
# in addition there could be stand alone words such as NOWRAP or NOSHADE
# that aren't the name of the tag
$tag =~ s/=\s+/=/g; #remove any white space immediately after =
$tag =~ s/\s+=/=/g; #remove any white space immediately before =
my $startPosition = 0; # initialize for until loop
my $tagEnd = 0; # initialize for until loop
my $endPair; # initialize for until loop
my $property;
my $value;
my $pair;
my @goodPairs;
until ($tagEnd==1) {
my $eqPosition = index($tag, "=", 0);
# look for stand-alone characters which are either tag identifiers
# or properties with no equal signs
my $possStandAlone = substr($tag, $eqPosition, 0);
if ($possStandAlone =~ /(\w+)\s+\w*/ ) {
my $fixed1 = lc($1);
$pair = $fixed1;
$endPair = index($tag, " ", 0);
goto PUSHPAIR;
}
my $nextEqPosition = index($tag, "=", ($eqPosition +2));
my $tagEndPosition = index($tag, ">");
if ($nextEqPosition < 0 ) {
# there are no more equals in tag, make ">" the nextEqPosition
$nextEqPosition = $tagEndPosition;
$tagEnd = 1;
}
# test are there quotes between this eq position and before next one?
my $dubQuotPosition = index($tag, '"', $eqPosition);
my $singQuotPosition = index($tag, "'", $eqPosition);
# remember example: onmouseout="imgOff('img01')"
if (($dubQuotPosition > -1) && ($singQuotPosition > -1 )) {
# we have both double and single quotes
if ($dubQuotPosition > $singQuotPosition) {
# the value is delimited by single quotes
$endPair = index($tag, "'", ($singQuotPosition+1));
} else {
# the value is delimited by double quotes
$endPair = index($tag, '"', ($dubQuotPosition+1));
}
}
elsif (($singQuotPosition > $eqPosition) && ($singQuotPosition < $nextEqPosition)) {
# the value is delimited by single quotes
$endPair = index($tag, "'", ($singQuotPosition+1));
}
elsif (($dubQuotPosition > $eqPosition) && ($dubQuotPosition < $nextEqPosition)) {
# the value is delimited by double quotes
$endPair = index($tag, '"', ($dubQuotPosition+1));
} else {
# there are no quotes in and no spaces after value string
# the first space after the = sign is the end of the pair
# or the end of the tag, if last property
$endPair = index($tag, " ", $eqPosition);
my $closeTag = index($tag, ">", $eqPosition);
if ($endPair < 0) {
$endPair = $closeTag;
}
}
# at this point we have isolated a property or property-value pair
$value = substr($tag, $eqPosition, ($endPair-$eqPosition + 1));
$property = substr($tag, 0, $eqPosition);
$property = lc($property);
# gather all good data into reassembed tag here
$pair = $property . $value;
PUSHPAIR:
$pair =~ s/^\s*//; # strip off leading and trailing spaces
$pair =~ s/\s*$//;
push(@goodPairs, $pair);
$startPosition = $endPair + 1;
$tag = substr($tag, $startPosition, $tagEndPosition);
$property = $value = $pair = $endPair = 0;
} # end until ($tagEnd)
$tagEnd = 0;
$tag = join(" ", @goodPairs);
# reinitialize goodPairs
@goodPairs = "";
# restore > to the end of tag
$tag = $tag . ">" if $tag !~ />$/;
$tag = "<" .$tag if $tag !~ /^</;
my $section;
if ($text) {
$section = $tag . $text if $tagFlag;
$section = $text . $tag if !$tagFlag;
} else {
$section = $tag;
}
return $section;
}
#*********** END *********
|
|