package My::HTML::WordML2HTML;
use strict;
use Carp;
use XML::LibXML;
use MIME::Base64;
# use Exporter; # No longer exports any functions
use CGI::Pretty qw(-autoload);
## TODO More robust error checking.
sub error {
my ( $self ) = @_;
return $self->{ error } || undef;
}
sub new {
my ( $class, $wordML, $args ) = @_;
my $self = {};
bless $self, $class;
while ( my( $key, $value ) = each %$args ) {
$self->{ $key } = $value ;
}
# check for missing values - fatal errors first
# Did we remember to specify the file to process?
do {
push @{ $self->{ error } }, "Required parameter 'wordML' missing. No file to parse";
return $self;
} unless $wordML;
# does the file to process exist and can we read it?
do {
push @{ $self->{ error } }, "The file - $self->{ wordML } does not exist, or cannot be read. No file to parse";
return $self;
} unless ( -e $wordML && -R $wordML );
# Make sure the file is a text file, and not a directory or a binary (i.e., in .doc firmat instead of .xml)
do {
push @{ $self->{ error } } , "$self->{ wordML } not a text file. No file to parse";
return $self;
} unless ( -T $wordML && -f $wordML );
# OK so far. Store the file to parse
$self->{ wordML } = $wordML;
# store the modification date.
$self->{ moddt } = ( stat $wordML )[9];
# non-fatal errors...
# Do we have a directory to place images?
if ( $self->{ images } ) {
# Does the directory exist?
do {
push @{ $self->{ error } }, qq(The specified directory for saving images - $self->{ images } - does not exist.
Image processing is not possible.);
# can we create it?
# TODO check for directory creation.
} unless -e $self->{ images };
# is it a directory
do {
push @{ $self->{ error } }, qq($self->{ images } is not a directory. Image processing not possible.);
} unless -d $self->{ images };
# can we write to it ?
do {
push @{ $self->{ error } }, qq(Cannot write to $self->{ images }. Image processing not possible.);
undef $self->{ images };
} unless -W $self->{ images };
}
else {
push @{ $self->{ error } }, qq(No directory is specified for the saving images. Image processing is not possible);
}
# Make sure that any missing pieces are in the right format.
$self->{ 'custom_meta' } ||= [];
$self->{ ilvl } = -1;
$self->{ 'jump_depth' } ||= 0;
return $self;
}
sub set_url {
my ( $self, $url ) = @_;
$self->{ url } = $url;
}
sub wordML2html {
my ( $self ) = @_;
# parse the document and convert it to html.
# Step 1 - initiate an XML::LibXML parser..
my $p = XML::LibXML->new();
# Step 2 - parse the file or kick us out setting an error
my $data = eval{ $p->parse_file( $self->{ wordML } || die "Can't parse file $self->{ wordML }" ) };
if ( $@ ) {
return $@;
}
# step 3 parse the document properties that will make the HTML head.
( $self->{ 'document_info'}, $self->{ 'html_meta' } ) = make_meta( $data, $self->{ 'custom_meta' } );
# step 4 Parse and store the list definitions
if ( my ($lists) = $data->findnodes('//w:lists') ) {
$self->{ lists } = $self->get_lists( $lists ) if $lists->hasChildNodes();
}
# step 5 - parse the body of the document...
# First, an internal reference to the lists in the document
# Now find the Document body and process it.
my ( $body ) = $data->findnodes('//w:body');
do {
$self->{ error } = "No data to process";
return $self
} unless $body->hasChildNodes();
foreach my $sect ( $body->findnodes('wx:sect') ) {
$self->{ 'html_body' } .= $self->process_subsection( $sect );
}
return $self;
}
sub get_lists {
my ( $self, $lists ) = @_;
my ( $masterlist, $defs );
my @def;
if ( @def = $lists->findnodes( 'w:listDef' ) ){
foreach my $listDef ( @def ) {
my $listDefId = $listDef->findvalue( '@w:listDefId' );
if ( my @lvl = $listDef->findnodes( 'w:lvl' )) {
foreach my $lvl ( @lvl ) {
$defs->{ $listDefId }[ $lvl->findvalue( '@w:ilvl' ) ] = process_lvl( $lvl );
}
}
}
}
my @list = $lists->findnodes( 'w:list' );
foreach ( @list ) {
my $ilfo = $_->findvalue( '@w:ilfo' );
my $ilst = $_->findvalue( 'w:ilst/@w:val' );
$masterlist->{ $ilfo } = $defs->{ $ilst };
}
return $masterlist;
}
sub process_lvl {
my ( $lvl ) = @_;
my $nfc = $lvl->findvalue( 'w:nfc/@w:val' ) || 0;
my $start = $lvl->findvalue( 'w:start/@w:val' ) || 0;
my @nfcs = (
qq(
);
return $props;
}
# accessor method
sub htmlBody {
my $self = shift;
return $self->{ 'html_body' } ;
}
# accessor method
sub htmlMeta {
my $self = shift;
return "No meta information available. Perhaps you need to call wordML2html() to parse the file first?" unless ref $self->{ 'html_meta' } eq 'ARRAY';
return wantarray ? @{ $self->{ 'html_meta' } } : join("\n", @{ $self->{ 'html_meta' } } );
}
# accessor method
sub get_info {
my $self = shift;
return $self->{ 'document_info' } || "File info not set. Perhaps the file has not been parsed yet? Call wordML2html() to parse the XML file and set the document info.";
}
# accessor method
sub docLists {
my $self = shift;
$self->{ lists };
}
sub make_meta {
my ( $data, $cfields ) = @_;
return 'Not a XML::LibXML::Document reference' unless ref $data eq 'XML::LibXML::Document';
my %master_meta;
if ( my ( $props ) = $data->findnodes('//o:DocumentProperties') ) {
if ( my @nodes = $props->childNodes() ) {
$master_meta{ lc( $_->localname() ) } = _unlegalize( $_->textContent() ) for ( @nodes );
}
}
if ( my ( $custom ) = $data->findnodes('//o:CustomDocumentProperties') ) {
if ( my @nodes = $custom->childNodes() ) {
$master_meta{ lc( $_->localname() ) } = _unlegalize( $_->textContent() ) for ( @nodes );
}
}
my @html_head;
my @meta = qw(title keywords description author);
#my @meta = keys %master_meta;
for ( @meta, @$cfields ) {
if ( $master_meta{ $_ } ) {
push @html_head, qq() ;
}
}
return ( \%master_meta, \@html_head );
}
sub process_subsection {
my ( $self, $sub ) = @_;
return '' unless $sub->hasChildNodes();
$self->{ depth }++;
my $text ='';
my @nodes = $sub->childNodes();
for ( @nodes ) {
my $name = $_->nodeName();
if ( $name =~ /w:p/ ) {
$text .= $self->process_p_node( $_ );
}
elsif ( $name =~ /w:tbl/ ) {
$text.= $self->process_tbl( $_ );
}
elsif ( $name =~ /sub-section/ ) {
$text .= $self->process_subsection( $_ );
}
elsif ( $name =~ /annotation/ ) {
$text .= $self->process_annotation( $_ );
}
}
if ( $self->{ ilvl } > -1 ) {
$text .= qq(
);
}
if ( $self->{ 'jump_depth' } > 0 ) {
$text .= jumplink() if $self->{ depth } == $self->{ 'jump_depth' };
}
$self->{ depth } --;
return $text;
}
sub get_style_name {
my ( $p ) = @_;
my $class;
my ( $pPr ) = $p->findnodes('w:pPr');
return unless ( $pPr && $pPr->hasChildNodes );
my $classname = $pPr->findvalue('w:pStyle/@w:val') ;
my $style = '';
# grab the rest of the style and past them into a style decl.
# again, this is incomplete...
my $jc = $pPr->findvalue('w:jc/@w:val');
if ( $jc ) {
$style = qq(style="text-align: $jc;);
}
# close the attribute list
$style .= qq(") if $style;
$class = qq(class="$classname") if $classname;
return ( $class, $style );
}
sub process_tbl {
my ( $self, $tbl ) = @_;
return '' unless $tbl->hasChildNodes();
my @nodes = $tbl->childNodes();
my $table;
for my $node ( @nodes ) {
for ( $node->nodeName() ) {
/w:tr/ && do { $table .= $self->process_tr_node( $node ); last; };
}
}
return table( {-cellpadding=>0, -cellspacing=>0}, $table );
}
sub process_tr_node {
my ( $self, $tr ) = @_;
return '' unless $tr->hasChildNodes();
my @nodes = $tr->childNodes();
my $row; # will hold all of the row ...
for my $node ( @nodes ) {
for ( $node->nodeName() ) {
/w:tc/ && do { $row .= $self->process_tc_node( $node ); last; };
}
}
return Tr( $row );
}
sub process_tc_node {
my ( $self, $tc ) = @_;
return '' unless $tc->hasChildNodes();
my @nodes = $tc->childNodes();
# first, grab some properties. There are plenty, as a crutch we're just getting the ones we care about.
my ($tcPr) = $tc->findnodes('w:tcPr');
my $colspan = $tcPr->findvalue('w:gridSpan/@w:val') || 1;
my $vAlign = $tcPr->findvalue('w:vAlign/@w:val' ) || 'top';
$vAlign = 'middle' if $vAlign eq 'center';
my $bgcolor = $tcPr->findvalue('w:shd/@w:fill') || 'none';
$bgcolor = '#' . $bgcolor unless $bgcolor eq 'none';
my $td;
for my $node ( @nodes ) {
for ( $node->nodeName() ) {
/w:p/ && do { $td .= $self->process_p_node( $node ); last; };
# other possibilities exist ...
}
}
if ( $self->{ ilvl } > -1 ) {
$td .= "\n";
$self->{ ilvl } = -1;
}
return td({-valign => $vAlign, -colspan=>$colspan, -style=>"background-color: " . $bgcolor}, $td );
}
sub process_p_node {
my ( $self, $p ) = @_;
return '' unless $p->hasChildNodes();
my ( $class, $style) = get_style_name( $p );
my $attr = join(" ", $class, $style);
my $tag = $self->is_list( $p, $class );
my @nodes = $p->childNodes();
my $text;
for my $node ( @nodes ) {
for ( $node->nodeName() ) {
/r/ && do { $text .= $self->process_r_node( $node ); last; };
/hlink/ && do { $text .= $self->process_hyperlink( $node ); last; };
}
}
if ( int( $self->{ ilvl } ) > -1 ) {
return $tag . "\t" x ( $self-> { depth } + $self->{ ilvl } ) . qq(\t$text\n);
}
else {
return $tag . "\t" x ( $self-> { depth } + $self->{ ilvl } ) . qq(\t$text
\n);
}
}
sub process_r_node {
my ( $self, $r ) = @_;
return '' unless $r->hasChildNodes();
my $text;
if ( my @pics = $r->findnodes( 'w:pict' ) ) {
$text .= $self->process_pict_node( $_ ) for ( @pics );
return $text;
}
# not an image ?
# fetch the text of the run first
for ( $r->findnodes('w:t') ) {
$text .= $_->textContent();
}
if ( my ( $rPr ) = $r->findnodes('w:rPr') ) {
my @nodes = $rPr->childNodes();
for ( reverse @nodes ) {
$text = html_tags( $_, $text );
}
}
return $text;
}
sub process_pict_node {
my ( $self, $pict ) = @_;
return undef unless $self->{ images }; # need to know where to put files.
return undef unless $pict->hasChildNodes();
# we're curently only looking for two nodes, w:shape and w:binData
# w:shape holds a file name, alt tags and style for
my ( $path, $url ) = $self->get_path();
my $image = $pict->getElementsByTagName( 'w:binData' );
my ( $shape ) = $pict->getElementsByTagName( 'v:shape/v:imagedata') ;
my $alt;
my ( $s, $src ) = split'://', $shape->findvalue('@src'); #'
my $type = get_file_type( $src );
$alt = $shape->findvalue('@o:title' );
if ( $alt ) {
# $path is package global set when the main subroutine is first called
my $imod = (stat "$path/$alt.$type")[9];
return qq(\n\t\t
\n) unless $imod < $self->{ moddt };
eval{ open IMG, ">$path/$alt.$type" or die "$! $path/$alt.$type"; };
if ( $@ ) {
warn $@;
return undef;
}
print IMG decode_base64( $image->to_literal );
close IMG or die "$!";
# return the HTML tags
return qq(\n\t\t
\n) ;
}
}
sub process_hyperlink {
my ( $self, $link ) = @_;
my $mark = make_html_legal( $link->findvalue('@w:bookmark') ) || '';
my $href = '';
my $target = '';
if ( my $hlink = $link->findvalue( '@w:dest' ) ) {
( $href = $hlink ) =~ s/\%3f/\?/g;
#$href = $hlink;
}
if ( $href =~ /^http/ ) {
$target = ' target=_blank';
}
else {
if ( ! $self=> { 'absolute_urls' } ) {
$href = $self->{'base_url' } . $href if $href;
}
if ( $href =~ /pdf$/ ) {
$target = ' target=_blank';
}
}
$href .= "#" . $mark if $mark;
my $text;
for ( $link->findnodes('w:r') ) {
$text .= $self->process_r_node( $_ );
}
return qq($text);
}
sub process_annotation {
my ( $self, $anno ) = @_;
my $name;
if ( my $type = $anno->findvalue('@w:type' ) ) {
if ( $type eq 'Word.Bookmark.Start' ) {
$name = make_html_legal( $anno->findvalue('@w:name') );
}
}
$name ? return qq() : return '';
}
sub get_path {
use File::Basename;
my $self = shift;
# a convenience ..
my $d = $self->{ images };
my ( $url, $fn );
# Did we send in a directory instead of the filename?
if ( -e $d && -d $d ) {
chop $d if ( $d =~ '/$');
# If we passed in a directory name, we consider that a subfolder
$fn = "";
}
else {
# If we sent a filename, we attempt to figure out first whether we can determine the enclosing directory.
( $fn, $d, undef ) = fileparse( $self->{ wordML }, qr{\..*} );
if ( ! -d $d ) {
carp "Path does not exist";
return undef;
}
chop $d if ( $d =~ '/$');
}
# now $d is our base directory. Append [filename]_filelist
my $path = ( $fn ? $d . "${fn}_filelist" : $d );
if ( ! -e $path ) {
eval{ mkdir $path, 0777 or die "$!" };
if ( $@ ) {
carp $@;
return undef;
}
}
if ( index( $path, $self->{'doc_root'} ) == 0 ) {
$url = substr( $path, length $self->{'doc_root'} ), "\n";
}
else {
croak "File or Directory $path Not in the Server's Document Root: $self->{'doc_root'} ";
}
if ( $url && $self->{ 'absolute_urls' } ) {
$url = '/' . $url;
}
return ($path, $url);
}
sub get_file_type {
my $src = shift;
use File::Basename;
my (undef, undef, $ext ) = fileparse( $src, qr{\..*} );
my @ext = split/\./, $ext;
return pop @ext;
}
sub make_html_legal {
$_[0] = substr($_[0], 0, 64);
#$_[0] =~ s/[\W_]+//g;
return $_[0];
}
sub html_tags {
my ($node, $text) = @_;
my $name = $node->nodeName();
for ( $name ) {
/w:b$/i && do { $text = b( $text ); last; };
/w:i/i && do { $text = i( $text ); last; };
/w:rStyle/i && do { my $s = $node->findvalue('@w:val');
$text = span({-class=>$s}, $text );
last;
};
# more to come ...
}
return $text;
}
sub is_list {
my ($self, $p, $class) = @_;
my ( $ilvl, $ilfo, $dif, $tag );
if ( my( $pPr ) = $p->findnodes('w:pPr/w:listPr') ) {
# is a list
$ilvl = $pPr->findvalue('w:ilvl/@w:val');
$ilfo = $pPr->findvalue('w:ilfo/@w:val');
#warn "ilfo = $ilfo\n";
}
else {
# not a list
$ilvl = -1;
}
my $cur = $self->{ ilvl };
my $dif = int( $ilvl ) - int( $cur );
if ( $dif == 0 ) {
$tag = '';
}
elsif ( $dif > 0 ) {
for (1..$dif) {
$tag .= "\t" x ( $self->{ depth } + $ilvl ) . $self->{ lists }{ $ilfo }[ $ilvl ];
}
}
elsif ( $dif < 0 ) {
$dif *= -1;
for (reverse( 1..$dif) ) {
$tag .= "\t" x ( ( $self->{ depth } + $_ ) ). "\n";
}
}
$self->{ ilvl } = $ilvl;
return $tag;
}
# _unlegalize converts hex values into something that the user can read better.
sub _unlegalize {
my $val = shift;
# Translate Hex Values like _x020_ into a space. Called for items that need to
# be read by a Web viewer as text. Don't use it on values that have to go back into XML elements
# as the name suggests, they won't be legal any longer.
$val =~ s/_x([[:xdigit:]]+)_/chr hex $1/eg;
# Trim leading/trailing spaces
$val =~ s/^\s+//;
$val =~ s/\s+$//;
return $val;
}
sub jumplink {
return qq(
[
^top
]
);
}
1;