#!/usr/bin/perl -w
# [[sxw2txt]] -- Converts OpenOffice.org Writer files to plain text.
# Copyright (C) 2004 Liam Morland
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
# USA.
#
# Liam Morland <Liam@Morland.ca> <http://Liam.Morland.ca/>
# 86A McDougall Road, Waterloo, Ontario, N2L 5C5, CANADA
#modified radeff 2005
use strict;
use warnings;

# sxw2txt: pull content.xml out of an OpenOffice.org Writer file with
# unzip(1) and reduce it to plain text with a series of regex substitutions.
#
# Usage:       sxw2txt input-file
# Output:      the converted text on STDOUT, always ending with a newline.
# Exit status: 0 on success, 1 when no input file was given, 2 when no
#              content.xml could be extracted from the input file.

# First argument is taken to be the input file. All other args are ignored.
my $input_file = shift;

# Without a filename, print usage information and stop. Test definedness
# and length rather than truth so that a file literally named "0" works.
if (!defined $input_file || !length $input_file) {
    print STDERR "sxw2txt: Converts OpenOffice.org Writer files to plain text.\n";
    print STDERR "Usage: sxw2txt input-file\n";
    exit(1);
}

# The command below goes through the shell (needed for the 2>/dev/null
# redirection), so the filename must be quoted: wrap it in single quotes
# and escape any embedded single quote as '\''. A leading "-" is prefixed
# with "./" so unzip cannot mistake the name for an option.
my $quoted_file = $input_file;
$quoted_file = "./$quoted_file" if $quoted_file =~ /\A-/;
$quoted_file =~ s/'/'\\''/g;
my $content = `unzip -p '$quoted_file' content.xml 2>/dev/null`;

# If we don't have any content.xml, exit with an error.
if (!defined $content || !length $content) {
    print STDERR "sxw2txt: Error: $input_file is probably not an OpenOffice.org file.\n";
    exit(2);
}

# Convert the OOo XML to text with a series of regex substitutions.
# The order of the substitutions matters: specific tags are rewritten first,
# then all remaining tags are dropped, then entities are decoded.
for ($content) {
    # Newlines in the XML carry no meaning; flatten them to spaces.
    s,\n+, ,g;

    # Tables, rows and cells each start on a new line. (An earlier revision
    # emitted [begin-table], [end-table], [table row] and [table cell]
    # markers here; radeff's 2005 modification replaced them with plain
    # newlines.)
    # "<text:p(?: [^>]*)?>" matches only the paragraph element itself, not
    # other elements whose name merely starts with "text:p" (for example
    # text:page-number).
    s,<table:table( [^>]*)?>,\n,g;
    s,</table:table>,\n,g;
    s,<table:table-cell( [^>]*)?>(<[^>]+>)*<text:p(?: [^>]*)?>,\n,g;
    s,<table:table-row( [^>]*)?>,\n,g;

    # OOo tabs are made into tab characters.
    s,<text:tab-stop/>,\t,g;

    # Each list item is given a '*' as a bullet.
    # Sorry, no fancy support for nested lists yet.
    s,<text:list-item><text:p(?: [^>]*)?>,\n\n* ,g;

    # Skip two lines before each new paragraph.
    s,<text:p(?: [^>]*)?>,\n\n,g;

    # Explicit line breaks become newlines (added by radeff).
    s,<text:line-break/>,\n,g;

    # Get rid of any remaining tags. Want to add support for tags not
    # handled above? Do it above this line.
    s,<[^>]*>,,g;

    # Convert the predefined XML entities into the appropriate character.
    # &amp; must be decoded last, otherwise text such as "&amp;lt;" would be
    # decoded twice and come out as "<" instead of "&lt;".
    s,&lt;,<,g;
    s,&gt;,>,g;
    s,&apos;,',g;
    s,&quot;,",g;
    s,&amp;,&,g;

    # Legacy byte-level replacements for accented characters, kept exactly
    # as found in this file.
    # NOTE(review): several of these look damaged by a past re-encoding of
    # this script (some replace a sequence with itself) -- confirm against
    # real documents before changing or removing them.
    s,é,é,g;
    s,Ú,è,g;
    s,â,',g;
    s,Ã,à,g;
    s,à¢,â,g;
    s,à®,î,g;
    s,à§,ç,g;
    s, , ,g;
    s,àŒ,ü,g;
    s,Â,\n\n,g;
    s,à«,ë,g;

    # Collapse runs of three or more newlines into a single blank line.
    s/\n{3,}/\n\n/g;

    # Trim leading and trailing whitespace. Two anchored substitutions are
    # used because a single greedy "^\s*(.+)\s*$" lets (.+) swallow the
    # trailing whitespace, leaving it in place.
    s/\A\s+//;
    s/\s+\z//;
}

# Print the result, always ending with \n.
print "$content\n";