package My::HTML::WordML2HTML; use strict; use Carp; use XML::LibXML; use MIME::Base64; # use Exporter; # No longer exports any functions use CGI::Pretty qw(-autoload); ## TODO More robust error checking. sub error { my ( $self ) = @_; return $self->{ error } || undef; } sub new { my ( $class, $wordML, $args ) = @_; my $self = {}; bless $self, $class; while ( my( $key, $value ) = each %$args ) { $self->{ $key } = $value ; } # check for missing values - fatal errors first # Did we remember to specify the file to process? do { push @{ $self->{ error } }, "Required parameter 'wordML' missing. No file to parse"; return $self; } unless $wordML; # does the file to process exist and can we read it? do { push @{ $self->{ error } }, "The file - $self->{ wordML } does not exist, or cannot be read. No file to parse"; return $self; } unless ( -e $wordML && -R $wordML ); # Make sure the file is a text file, and not a directory or a binary (i.e., in .doc firmat instead of .xml) do { push @{ $self->{ error } } , "$self->{ wordML } not a text file. No file to parse"; return $self; } unless ( -T $wordML && -f $wordML ); # OK so far. Store the file to parse $self->{ wordML } = $wordML; # store the modification date. $self->{ moddt } = ( stat $wordML )[9]; # non-fatal errors... # Do we have a directory to place images? if ( $self->{ images } ) { # Does the directory exist? do { push @{ $self->{ error } }, qq(The specified directory for saving images - $self->{ images } - does not exist. Image processing is not possible.); # can we create it? # TODO check for directory creation. } unless -e $self->{ images }; # is it a directory do { push @{ $self->{ error } }, qq($self->{ images } is not a directory. Image processing not possible.); } unless -d $self->{ images }; # can we write to it ? do { push @{ $self->{ error } }, qq(Cannot write to $self->{ images }. Image processing not possible.); undef $self->{ images }; } unless -W $self->{ images }; } else { push @{ $self->{ error } }, qq(No directory is specified for the saving images. Image processing is not possible); } # Make sure that any missing pieces are in the right format. $self->{ 'custom_meta' } ||= []; $self->{ ilvl } = -1; $self->{ 'jump_depth' } ||= 0; return $self; } sub set_url { my ( $self, $url ) = @_; $self->{ url } = $url; } sub wordML2html { my ( $self ) = @_; # parse the document and convert it to html. # Step 1 - initiate an XML::LibXML parser.. my $p = XML::LibXML->new(); # Step 2 - parse the file or kick us out setting an error my $data = eval{ $p->parse_file( $self->{ wordML } || die "Can't parse file $self->{ wordML }" ) }; if ( $@ ) { return $@; } # step 3 parse the document properties that will make the HTML head. ( $self->{ 'document_info'}, $self->{ 'html_meta' } ) = make_meta( $data, $self->{ 'custom_meta' } ); # step 4 Parse and store the list definitions if ( my ($lists) = $data->findnodes('//w:lists') ) { $self->{ lists } = $self->get_lists( $lists ) if $lists->hasChildNodes(); } # step 5 - parse the body of the document... # First, an internal reference to the lists in the document # Now find the Document body and process it. my ( $body ) = $data->findnodes('//w:body'); do { $self->{ error } = "No data to process"; return $self } unless $body->hasChildNodes(); foreach my $sect ( $body->findnodes('wx:sect') ) { $self->{ 'html_body' } .= $self->process_subsection( $sect ); } return $self; } sub get_lists { my ( $self, $lists ) = @_; my ( $masterlist, $defs ); my @def; if ( @def = $lists->findnodes( 'w:listDef' ) ){ foreach my $listDef ( @def ) { my $listDefId = $listDef->findvalue( '@w:listDefId' ); if ( my @lvl = $listDef->findnodes( 'w:lvl' )) { foreach my $lvl ( @lvl ) { $defs->{ $listDefId }[ $lvl->findvalue( '@w:ilvl' ) ] = process_lvl( $lvl ); } } } } my @list = $lists->findnodes( 'w:list' ); foreach ( @list ) { my $ilfo = $_->findvalue( '@w:ilfo' ); my $ilst = $_->findvalue( 'w:ilst/@w:val' ); $masterlist->{ $ilfo } = $defs->{ $ilst }; } return $masterlist; } sub process_lvl { my ( $lvl ) = @_; my $nfc = $lvl->findvalue( 'w:nfc/@w:val' ) || 0; my $start = $lvl->findvalue( 'w:start/@w:val' ) || 0; my @nfcs = ( qq(
    ); return $props; } # accessor method sub htmlBody { my $self = shift; return $self->{ 'html_body' } ; } # accessor method sub htmlMeta { my $self = shift; return "No meta information available. Perhaps you need to call wordML2html() to parse the file first?" unless ref $self->{ 'html_meta' } eq 'ARRAY'; return wantarray ? @{ $self->{ 'html_meta' } } : join("\n", @{ $self->{ 'html_meta' } } ); } # accessor method sub get_info { my $self = shift; return $self->{ 'document_info' } || "File info not set. Perhaps the file has not been parsed yet? Call wordML2html() to parse the XML file and set the document info."; } # accessor method sub docLists { my $self = shift; $self->{ lists }; } sub make_meta { my ( $data, $cfields ) = @_; return 'Not a XML::LibXML::Document reference' unless ref $data eq 'XML::LibXML::Document'; my %master_meta; if ( my ( $props ) = $data->findnodes('//o:DocumentProperties') ) { if ( my @nodes = $props->childNodes() ) { $master_meta{ lc( $_->localname() ) } = _unlegalize( $_->textContent() ) for ( @nodes ); } } if ( my ( $custom ) = $data->findnodes('//o:CustomDocumentProperties') ) { if ( my @nodes = $custom->childNodes() ) { $master_meta{ lc( $_->localname() ) } = _unlegalize( $_->textContent() ) for ( @nodes ); } } my @html_head; my @meta = qw(title keywords description author); #my @meta = keys %master_meta; for ( @meta, @$cfields ) { if ( $master_meta{ $_ } ) { push @html_head, qq() ; } } return ( \%master_meta, \@html_head ); } sub process_subsection { my ( $self, $sub ) = @_; return '' unless $sub->hasChildNodes(); $self->{ depth }++; my $text =''; my @nodes = $sub->childNodes(); for ( @nodes ) { my $name = $_->nodeName(); if ( $name =~ /w:p/ ) { $text .= $self->process_p_node( $_ ); } elsif ( $name =~ /w:tbl/ ) { $text.= $self->process_tbl( $_ ); } elsif ( $name =~ /sub-section/ ) { $text .= $self->process_subsection( $_ ); } elsif ( $name =~ /annotation/ ) { $text .= $self->process_annotation( $_ ); } } if ( $self->{ ilvl } > -1 ) { $text .= qq(
); } if ( $self->{ 'jump_depth' } > 0 ) { $text .= jumplink() if $self->{ depth } == $self->{ 'jump_depth' }; } $self->{ depth } --; return $text; } sub get_style_name { my ( $p ) = @_; my $class; my ( $pPr ) = $p->findnodes('w:pPr'); return unless ( $pPr && $pPr->hasChildNodes ); my $classname = $pPr->findvalue('w:pStyle/@w:val') ; my $style = ''; # grab the rest of the style and past them into a style decl. # again, this is incomplete... my $jc = $pPr->findvalue('w:jc/@w:val'); if ( $jc ) { $style = qq(style="text-align: $jc;); } # close the attribute list $style .= qq(") if $style; $class = qq(class="$classname") if $classname; return ( $class, $style ); } sub process_tbl { my ( $self, $tbl ) = @_; return '' unless $tbl->hasChildNodes(); my @nodes = $tbl->childNodes(); my $table; for my $node ( @nodes ) { for ( $node->nodeName() ) { /w:tr/ && do { $table .= $self->process_tr_node( $node ); last; }; } } return table( {-cellpadding=>0, -cellspacing=>0}, $table ); } sub process_tr_node { my ( $self, $tr ) = @_; return '' unless $tr->hasChildNodes(); my @nodes = $tr->childNodes(); my $row; # will hold all of the row ... for my $node ( @nodes ) { for ( $node->nodeName() ) { /w:tc/ && do { $row .= $self->process_tc_node( $node ); last; }; } } return Tr( $row ); } sub process_tc_node { my ( $self, $tc ) = @_; return '' unless $tc->hasChildNodes(); my @nodes = $tc->childNodes(); # first, grab some properties. There are plenty, as a crutch we're just getting the ones we care about. my ($tcPr) = $tc->findnodes('w:tcPr'); my $colspan = $tcPr->findvalue('w:gridSpan/@w:val') || 1; my $vAlign = $tcPr->findvalue('w:vAlign/@w:val' ) || 'top'; $vAlign = 'middle' if $vAlign eq 'center'; my $bgcolor = $tcPr->findvalue('w:shd/@w:fill') || 'none'; $bgcolor = '#' . $bgcolor unless $bgcolor eq 'none'; my $td; for my $node ( @nodes ) { for ( $node->nodeName() ) { /w:p/ && do { $td .= $self->process_p_node( $node ); last; }; # other possibilities exist ... } } if ( $self->{ ilvl } > -1 ) { $td .= "\n"; $self->{ ilvl } = -1; } return td({-valign => $vAlign, -colspan=>$colspan, -style=>"background-color: " . $bgcolor}, $td ); } sub process_p_node { my ( $self, $p ) = @_; return '' unless $p->hasChildNodes(); my ( $class, $style) = get_style_name( $p ); my $attr = join(" ", $class, $style); my $tag = $self->is_list( $p, $class ); my @nodes = $p->childNodes(); my $text; for my $node ( @nodes ) { for ( $node->nodeName() ) { /r/ && do { $text .= $self->process_r_node( $node ); last; }; /hlink/ && do { $text .= $self->process_hyperlink( $node ); last; }; } } if ( int( $self->{ ilvl } ) > -1 ) { return $tag . "\t" x ( $self-> { depth } + $self->{ ilvl } ) . qq(\t
  • $text
  • \n); } else { return $tag . "\t" x ( $self-> { depth } + $self->{ ilvl } ) . qq(\t

    $text

    \n); } } sub process_r_node { my ( $self, $r ) = @_; return '' unless $r->hasChildNodes(); my $text; if ( my @pics = $r->findnodes( 'w:pict' ) ) { $text .= $self->process_pict_node( $_ ) for ( @pics ); return $text; } # not an image ? # fetch the text of the run first for ( $r->findnodes('w:t') ) { $text .= $_->textContent(); } if ( my ( $rPr ) = $r->findnodes('w:rPr') ) { my @nodes = $rPr->childNodes(); for ( reverse @nodes ) { $text = html_tags( $_, $text ); } } return $text; } sub process_pict_node { my ( $self, $pict ) = @_; return undef unless $self->{ images }; # need to know where to put files. return undef unless $pict->hasChildNodes(); # we're curently only looking for two nodes, w:shape and w:binData # w:shape holds a file name, alt tags and style for my ( $path, $url ) = $self->get_path(); my $image = $pict->getElementsByTagName( 'w:binData' ); my ( $shape ) = $pict->getElementsByTagName( 'v:shape/v:imagedata') ; my $alt; my ( $s, $src ) = split'://', $shape->findvalue('@src'); #' my $type = get_file_type( $src ); $alt = $shape->findvalue('@o:title' ); if ( $alt ) { # $path is package global set when the main subroutine is first called my $imod = (stat "$path/$alt.$type")[9]; return qq(\n\t\t$alt\n) unless $imod < $self->{ moddt }; eval{ open IMG, ">$path/$alt.$type" or die "$! $path/$alt.$type"; }; if ( $@ ) { warn $@; return undef; } print IMG decode_base64( $image->to_literal ); close IMG or die "$!"; # return the HTML tags return qq(\n\t\t$alt\n) ; } } sub process_hyperlink { my ( $self, $link ) = @_; my $mark = make_html_legal( $link->findvalue('@w:bookmark') ) || ''; my $href = ''; my $target = ''; if ( my $hlink = $link->findvalue( '@w:dest' ) ) { ( $href = $hlink ) =~ s/\%3f/\?/g; #$href = $hlink; } if ( $href =~ /^http/ ) { $target = ' target=_blank'; } else { if ( ! $self=> { 'absolute_urls' } ) { $href = $self->{'base_url' } . $href if $href; } if ( $href =~ /pdf$/ ) { $target = ' target=_blank'; } } $href .= "#" . $mark if $mark; my $text; for ( $link->findnodes('w:r') ) { $text .= $self->process_r_node( $_ ); } return qq($text); } sub process_annotation { my ( $self, $anno ) = @_; my $name; if ( my $type = $anno->findvalue('@w:type' ) ) { if ( $type eq 'Word.Bookmark.Start' ) { $name = make_html_legal( $anno->findvalue('@w:name') ); } } $name ? return qq() : return ''; } sub get_path { use File::Basename; my $self = shift; # a convenience .. my $d = $self->{ images }; my ( $url, $fn ); # Did we send in a directory instead of the filename? if ( -e $d && -d $d ) { chop $d if ( $d =~ '/$'); # If we passed in a directory name, we consider that a subfolder $fn = ""; } else { # If we sent a filename, we attempt to figure out first whether we can determine the enclosing directory. ( $fn, $d, undef ) = fileparse( $self->{ wordML }, qr{\..*} ); if ( ! -d $d ) { carp "Path does not exist"; return undef; } chop $d if ( $d =~ '/$'); } # now $d is our base directory. Append [filename]_filelist my $path = ( $fn ? $d . "${fn}_filelist" : $d ); if ( ! -e $path ) { eval{ mkdir $path, 0777 or die "$!" }; if ( $@ ) { carp $@; return undef; } } if ( index( $path, $self->{'doc_root'} ) == 0 ) { $url = substr( $path, length $self->{'doc_root'} ), "\n"; } else { croak "File or Directory $path Not in the Server's Document Root: $self->{'doc_root'} "; } if ( $url && $self->{ 'absolute_urls' } ) { $url = '/' . $url; } return ($path, $url); } sub get_file_type { my $src = shift; use File::Basename; my (undef, undef, $ext ) = fileparse( $src, qr{\..*} ); my @ext = split/\./, $ext; return pop @ext; } sub make_html_legal { $_[0] = substr($_[0], 0, 64); #$_[0] =~ s/[\W_]+//g; return $_[0]; } sub html_tags { my ($node, $text) = @_; my $name = $node->nodeName(); for ( $name ) { /w:b$/i && do { $text = b( $text ); last; }; /w:i/i && do { $text = i( $text ); last; }; /w:rStyle/i && do { my $s = $node->findvalue('@w:val'); $text = span({-class=>$s}, $text ); last; }; # more to come ... } return $text; } sub is_list { my ($self, $p, $class) = @_; my ( $ilvl, $ilfo, $dif, $tag ); if ( my( $pPr ) = $p->findnodes('w:pPr/w:listPr') ) { # is a list $ilvl = $pPr->findvalue('w:ilvl/@w:val'); $ilfo = $pPr->findvalue('w:ilfo/@w:val'); #warn "ilfo = $ilfo\n"; } else { # not a list $ilvl = -1; } my $cur = $self->{ ilvl }; my $dif = int( $ilvl ) - int( $cur ); if ( $dif == 0 ) { $tag = ''; } elsif ( $dif > 0 ) { for (1..$dif) { $tag .= "\t" x ( $self->{ depth } + $ilvl ) . $self->{ lists }{ $ilfo }[ $ilvl ]; } } elsif ( $dif < 0 ) { $dif *= -1; for (reverse( 1..$dif) ) { $tag .= "\t" x ( ( $self->{ depth } + $_ ) ). "\n"; } } $self->{ ilvl } = $ilvl; return $tag; } # _unlegalize converts hex values into something that the user can read better. sub _unlegalize { my $val = shift; # Translate Hex Values like _x020_ into a space. Called for items that need to # be read by a Web viewer as text. Don't use it on values that have to go back into XML elements # as the name suggests, they won't be legal any longer. $val =~ s/_x([[:xdigit:]]+)_/chr hex $1/eg; # Trim leading/trailing spaces $val =~ s/^\s+//; $val =~ s/\s+$//; return $val; } sub jumplink { return qq( ); } 1;