#!/usr/bin/perl
# sxw2txt -- Converts OpenOffice.org Writer files to plain text.
# Copyright (C) 2004 Liam Morland
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
# USA.
#
# Liam Morland <Liam@Morland.ca> <http://Liam.Morland.ca/>
# 86A McDougall Road, Waterloo, Ontario, N2L 5C5, CANADA
#
# Modified by radeff, 2005.

use strict;
use warnings;

use File::Spec;

# Convert the text of an OpenOffice.org Writer content.xml into plain text.
#
# Parameters:
#   $xml - the document XML as a string.
# Returns:
#   the plain-text rendering, with leading and trailing whitespace removed
#   (no final newline is appended here).
sub content_xml_to_text {
    my ($xml) = @_;
    my $text = $xml;

    # The XML's own line breaks carry no meaning; flatten them first so that
    # every newline in the result is one we inserted deliberately.
    $text =~ s{\n+}{ }g;

    # Tables, rows and cells each start on a new line.  The original version
    # wrapped them in textual markers instead; radeff's 2005 modification
    # replaced these with plain newlines:
    #   s,<table:table( [^>]*)?>,\n\n[begin-table],g;
    #   s,</table:table>,\n[end-table],g;
    #   s,<table:table-cell( [^>]*)?>(<[^>]+>)*<text:p[^>]*>,\n[table cell],g;
    #   s,<table:table-row( [^>]*)?>,\n\n[table row],g;
    $text =~ s{<table:table(?: [^>]*)?>}{\n}g;
    $text =~ s{</table:table>}{\n}g;
    $text =~ s{<table:table-cell(?: [^>]*)?>(?:<[^>]+>)*<text:p[^>]*>}{\n}g;
    $text =~ s{<table:table-row(?: [^>]*)?>}{\n}g;

    # OOo tabs are made into tab characters.
    $text =~ s{<text:tab-stop/>}{\t}g;

    # Each list item is given a '*' as a bullet.  This must run before the
    # generic paragraph rule below, which would otherwise consume the
    # <text:p> that follows <text:list-item>.
    # Sorry, no fancy support for nested lists yet.
    $text =~ s{<text:list-item><text:p[^>]*>}{\n\n* }g;

    # Skip two lines before each new paragraph.
    $text =~ s{<text:p[^>]*>}{\n\n}g;

    # Explicit line breaks become a single newline (added by radeff).
    $text =~ s{<text:line-break/>}{\n}g;

    # Get rid of any remaining tags.  Want to add support for tags not
    # handled above?  Do it above this line.
    $text =~ s{<[^>]*>}{}g;

    # Decode the five predefined XML entities.  Doing it in a single pass
    # guarantees that "&amp;lt;" yields the literal text "&lt;" rather than
    # being decoded twice.
    my %char_for = (
        lt   => '<',
        gt   => '>',
        apos => q{'},
        quot => q{"},
        amp  => '&',
    );
    $text =~ s{&(lt|gt|apos|quot|amp);}{$char_for{$1}}g;

    # Collapse runs of blank lines, then strip surrounding whitespace.
    # (A single greedy "(.+)" under /s would swallow the trailing
    # whitespace, so the two ends are trimmed separately.)
    $text =~ s{\n\n\n+}{\n\n}g;
    $text =~ s{\A\s+}{};
    $text =~ s{\s+\z}{};

    return $text;
}

# Extract content.xml from the given .sxw archive using unzip(1).
#
# Parameters:
#   $input_file - path of the archive.
# Returns:
#   the decoded XML text, or undef/'' when nothing could be extracted.
sub read_content_xml {
    my ($input_file) = @_;

    # Silence unzip's own diagnostics; the caller reports failure itself.
    my $stderr_saved = open my $saved_stderr, '>&', \*STDERR;
    open STDERR, '>', File::Spec->devnull if $stderr_saved;

    # List-form pipe open: the file name is handed to unzip as a single
    # argument and never seen by a shell, so names containing spaces or
    # shell metacharacters are safe.
    my $xml;
    if (open my $unzip, '-|', 'unzip', '-p', $input_file, 'content.xml') {
        # Decode the stream as UTF-8 so the substitutions operate on
        # characters rather than raw bytes.
        # NOTE(review): assumes content.xml is UTF-8 encoded -- confirm.
        binmode $unzip, ':encoding(UTF-8)';
        local $/;    # slurp mode
        $xml = <$unzip>;
        close $unzip;
    }

    open STDERR, '>&', $saved_stderr if $stderr_saved;

    return $xml;
}

# Program entry point.
#
# Parameters:
#   the command-line arguments; the first is taken to be the input file,
#   all others are ignored.
# Returns:
#   the process exit status: 0 on success, 1 when no input file was given,
#   2 when no content.xml could be read from the input file.
sub main {
    my ($input_file) = @_;

    # Without a file name, print usage information.  Diagnostics go to
    # STDERR so they never end up in a redirected output file.
    if (!defined $input_file || $input_file eq q{}) {
        print {*STDERR} "sxw2txt: Coverts OpenOffice.org Writer files to plain text.\n";
        print {*STDERR} "Usage: sxw2txt input-file\n";
        return 1;
    }

    # If we don't have any content.xml, exit with an error.
    my $xml = read_content_xml($input_file);
    if (!defined $xml || $xml eq q{}) {
        print {*STDERR} "sxw2txt: Error: $input_file is probably not an OpenOffice.org file.\n";
        return 2;
    }

    # Print the result as UTF-8, always ending with \n.
    binmode STDOUT, ':encoding(UTF-8)';
    print content_xml_to_text($xml), "\n";

    return 0;
}

# Run only when executed as a script, so the file can also be loaded
# (e.g. by a test) without side effects.
exit main(@ARGV) unless caller;

1;

=head1 NAME

sxw2txt - convert OpenOffice.org Writer (.sxw) files to plain text

=head1 SYNOPSIS

    sxw2txt input-file > output.txt

=head1 COMPANION SCRIPT

F<sxw2txt.sh> converts every C<*.sxw> file below the current directory.
Compared with the originally published version, the working directory is
now obtained with C<$(pwd)> (it used to be assigned the literal string
C<pwd>) and file names are read with C<IFS= read -r>.

    #!/usr/bin/bash
    # Script to automatically convert openoffice sxw files to plain txt files
    # Usage:
    # Required: http://lists.debian.org/debian-wnpp/2004/12/msg00289.html sxw2text
    # apt-get: NO
    #
    # Authors:
    # FR, radeff@akademia.ch
    # History
    # 2005.11.18: FR, created
    # To do:
    #########
    echo "Script to automatically convert word doc files to plain txt files"
    echo "************"
    WD=$(pwd)
    echo "Now converting all files under $WD"
    echo "************"
    declare -i j
    j=0
    find . -name "*.sxw" | while IFS= read -r i
    do
        j=$j+1
        echo "$j Converting $i TO $i.txt"
        sxw2txt "$i" > "$i.txt"
    done
    echo "************"
    #echo "Finished, $k files converted"
    echo "Finished, all files converted"

=head1 HISTORY

Taken from the wiki page F<info/sxw2txt.txt>, last modified 2018/07/18 09:46
by radeff.

=cut