diff --git a/src/main/org/apache/tools/zip/AbstractUnicodeExtraField.java b/src/main/org/apache/tools/zip/AbstractUnicodeExtraField.java index 14dbca540..560bc253a 100644 --- a/src/main/org/apache/tools/zip/AbstractUnicodeExtraField.java +++ b/src/main/org/apache/tools/zip/AbstractUnicodeExtraField.java @@ -34,23 +34,32 @@ public abstract class AbstractUnicodeExtraField implements ZipExtraField { } /** - * Assemble as unicode path extension form the name and encoding - * of the orginal zip entry. + * Assemble as unicode extension from the name/comment and + * encoding of the original zip entry. * - * @param name The file name or comment. + * @param text The file name or comment. * @param zipEncoding The encoding of the filenames in the zip * file, usually "CP437". */ - protected AbstractUnicodeExtraField(String name, String zipEncoding) { - - byte[] filename = ZipEncodingHelper.encodeName(name, zipEncoding); + protected AbstractUnicodeExtraField(String text, String zipEncoding) { + this(text, ZipEncodingHelper.encodeName(text, zipEncoding)); + } + /** + * Assemble as unicode extension from the name/comment given as + * text as well as the bytes actually written to the archive. + * + * @param text The file name or comment. + * @param bytes The name or comment as the bytes actually written + * to the archive; the CRC32 is computed from these. 
+ */ + protected AbstractUnicodeExtraField(String text, byte[] bytes) { CRC32 crc32 = new CRC32(); - crc32.update(filename); + crc32.update(bytes); nameCRC32 = crc32.getValue(); try { - unicodeName = name.getBytes("UTF-8"); + unicodeName = text.getBytes("UTF-8"); } catch (UnsupportedEncodingException e) { throw new RuntimeException("FATAL: UTF-8 encoding not supported.", e); diff --git a/src/main/org/apache/tools/zip/UnicodeCommentExtraField.java b/src/main/org/apache/tools/zip/UnicodeCommentExtraField.java index ab6269d55..ca100a548 100644 --- a/src/main/org/apache/tools/zip/UnicodeCommentExtraField.java +++ b/src/main/org/apache/tools/zip/UnicodeCommentExtraField.java @@ -42,15 +42,26 @@ public class UnicodeCommentExtraField extends AbstractUnicodeExtraField { } /** - * Assemble as unicode comment extension form the comment and + * Assemble as unicode comment extension from the comment and * encoding of the orginal zip entry. * - * @param name The file name + * @param comment The file comment * @param zipEncoding The encoding of the comment in the zip file, * usually "CP437". */ - public UnicodeCommentExtraField(String name, String zipEncoding) { - super(name, zipEncoding); + public UnicodeCommentExtraField(String comment, String zipEncoding) { + super(comment, zipEncoding); + } + + /** + * Assemble as unicode comment extension from the comment given as + * text as well as the bytes actually written to the archive. 
+ * + * @param comment The file comment + * @param bytes the bytes actually written to the archive + */ + public UnicodeCommentExtraField(String comment, byte[] bytes) { + super(comment, bytes); } public ZipShort getHeaderId() { diff --git a/src/main/org/apache/tools/zip/UnicodePathExtraField.java b/src/main/org/apache/tools/zip/UnicodePathExtraField.java index f2381b20d..dd3359a3d 100644 --- a/src/main/org/apache/tools/zip/UnicodePathExtraField.java +++ b/src/main/org/apache/tools/zip/UnicodePathExtraField.java @@ -42,7 +42,7 @@ public class UnicodePathExtraField extends AbstractUnicodeExtraField { } /** - * Assemble as unicode path extension form the name and encoding + * Assemble as unicode path extension from the name and encoding * of the orginal zip entry. * * @param name The file name @@ -53,6 +53,17 @@ public class UnicodePathExtraField extends AbstractUnicodeExtraField { super(name, zipEncoding); } + /** + * Assemble as unicode path extension from the name given as + * text as well as the bytes actually written to the archive. + * + * @param name The file name + * @param bytes the bytes actually written to the archive + */ + public UnicodePathExtraField(String name, byte[] bytes) { + super(name, bytes); + } + public ZipShort getHeaderId() { return UPATH_ID; } diff --git a/src/main/org/apache/tools/zip/ZipEncodingHelper.java b/src/main/org/apache/tools/zip/ZipEncodingHelper.java index 92e759d86..1b5d16e97 100644 --- a/src/main/org/apache/tools/zip/ZipEncodingHelper.java +++ b/src/main/org/apache/tools/zip/ZipEncodingHelper.java @@ -75,17 +75,23 @@ abstract class ZipEncodingHelper { * * * @param name The filename or comment with possible non-ASCII - * unicode characters. + * unicode characters. Must not be null. * @param encoding A valid encoding name. The standard zip * encoding is "CP437", * "UTF-8" is supported in ZIP file - * version 6.3 or later. + * version 6.3 or later. 
If null, + * will use the platform's {@link + * java.lang.String#getBytes default encoding}. * @return A byte array containing the mapped file * name. Unmappable characters or malformed character * sequences are mapped to a sequence of utf-16 words * encoded in the format %Uxxxx. */ static final byte[] encodeName(String name, String encoding) { + if (encoding == null) { + return name.getBytes(); + } + Charset cs = Charset.forName(encoding); CharsetEncoder enc = cs.newEncoder(); @@ -178,8 +184,12 @@ abstract class ZipEncodingHelper { * "UTF-8" is supported in ZIP file * version 6.3 or later. */ - static final String decodeName(byte[] name, String encoding) { + static final String decodeName(byte[] name, String encoding) + throws java.nio.charset.CharacterCodingException { Charset cs = Charset.forName(encoding); - return cs.decode(ByteBuffer.wrap(name)).toString(); + return cs.newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT) + .decode(ByteBuffer.wrap(name)).toString(); } } diff --git a/src/main/org/apache/tools/zip/ZipEntry.java b/src/main/org/apache/tools/zip/ZipEntry.java index 8c9b93220..681349efb 100644 --- a/src/main/org/apache/tools/zip/ZipEntry.java +++ b/src/main/org/apache/tools/zip/ZipEntry.java @@ -263,6 +263,18 @@ public class ZipEntry extends java.util.zip.ZipEntry implements Cloneable { setExtra(); } + /** + * Looks up an extra field by its header id. + * @param type the header id of the extra field to look up. + * @return the matching extra field, null if no such field exists. + */ + public ZipExtraField getExtraField(ZipShort type) { + if (extraFields != null) { + return (ZipExtraField) extraFields.get(type); + } + return null; + } + /** * Throws an Exception if extra data cannot be parsed into extra fields. 
* @param extra an array of bytes to be parsed into extra fields diff --git a/src/main/org/apache/tools/zip/ZipFile.java b/src/main/org/apache/tools/zip/ZipFile.java index e17184480..661500a7c 100644 --- a/src/main/org/apache/tools/zip/ZipFile.java +++ b/src/main/org/apache/tools/zip/ZipFile.java @@ -23,12 +23,14 @@ import java.io.IOException; import java.io.InputStream; import java.io.RandomAccessFile; import java.io.UnsupportedEncodingException; +import java.nio.charset.CharacterCodingException; import java.util.Calendar; import java.util.Collections; import java.util.Date; import java.util.Enumeration; import java.util.HashMap; import java.util.Map; +import java.util.zip.CRC32; import java.util.zip.Inflater; import java.util.zip.InflaterInputStream; import java.util.zip.ZipException; @@ -101,6 +103,11 @@ public class ZipFile { */ private RandomAccessFile archive; + /** + * Whether to look for and use Unicode extra fields. + */ + private final boolean useUnicodeExtraFields; + /** * Opens the given file for reading, assuming the platform's * native encoding for file names. @@ -127,7 +134,7 @@ public class ZipFile { /** * Opens the given file for reading, assuming the specified - * encoding for file names. + * encoding for file names and ignoring unicode extra fields. * * @param name name of the archive. * @param encoding the encoding to use for file names @@ -135,7 +142,21 @@ public class ZipFile { * @throws IOException if an error occurs while reading the file. */ public ZipFile(String name, String encoding) throws IOException { - this(new File(name), encoding); + this(new File(name), encoding, false); + } + + /** + * Opens the given file for reading, assuming the specified + * encoding for file names and ignoring unicode extra fields. + * + * @param f the archive. + * @param encoding the encoding to use for file names, use null + * for the platform's default encoding + * + * @throws IOException if an error occurs while reading the file. 
+ */ + public ZipFile(File f, String encoding) throws IOException { + this(f, encoding, false); } /** @@ -144,16 +165,20 @@ public class ZipFile { * * @param f the archive. * @param encoding the encoding to use for file names + * @param useUnicodeExtraFields whether to use InfoZIP Unicode Extra + * Fields (if present) to set the file names. * * @throws IOException if an error occurs while reading the file. */ - public ZipFile(File f, String encoding) throws IOException { + public ZipFile(File f, String encoding, boolean useUnicodeExtraFields) + throws IOException { this.encoding = encoding; + this.useUnicodeExtraFields = useUnicodeExtraFields; archive = new RandomAccessFile(f, "r"); boolean success = false; try { - populateFromCentralDirectory(); - resolveLocalFileHeaderData(); + Map entriesWithoutEFS = populateFromCentralDirectory(); + resolveLocalFileHeaderData(entriesWithoutEFS); success = true; } finally { if (!success) { @@ -270,9 +295,15 @@ public class ZipFile { *

<p>The ZipEntrys will know all data that can be obtained from * the central directory alone, but not the data that requires the * local file header or additional data to be read.</p>

+ * + * @return a Map&lt;ZipEntry, NameAndComment&gt; of the + * entries that didn't have the language encoding flag set when + * read; only populated if Unicode extra fields are to be used. */ - private void populateFromCentralDirectory() + private Map populateFromCentralDirectory() throws IOException { + HashMap noEFS = new HashMap(); + positionAtCentralDirectory(); byte[] cfh = new byte[CFH_LEN]; @@ -297,10 +328,10 @@ public class ZipFile { off += SHORT; // skip version info final int generalPurposeFlag = ZipShort.getValue(cfh, off); - final String entryEncoding = - (generalPurposeFlag & ZipOutputStream.EFS_FLAG) != 0 - ? ZipOutputStream.UTF8 - : encoding; + final boolean hasEFS = + (generalPurposeFlag & ZipOutputStream.EFS_FLAG) != 0; + final String entryEncoding = + hasEFS ? ZipOutputStream.UTF8 : encoding; off += SHORT; @@ -368,7 +399,12 @@ public class ZipFile { archive.readFully(signatureBytes); sig = ZipLong.getValue(signatureBytes); + + if (!hasEFS && useUnicodeExtraFields) { + noEFS.put(ze, new NameAndComment(fileName, comment)); + } } + return noEFS; } private static final int MIN_EOCD_SIZE = @@ -463,7 +499,7 @@ public class ZipFile { *

<p>Also records the offsets for the data to read from the * entries.</p>

*/ - private void resolveLocalFileHeaderData() + private void resolveLocalFileHeaderData(Map entriesWithoutEFS) throws IOException { Enumeration e = getEntries(); while (e.hasMoreElements()) { @@ -494,6 +530,12 @@ public class ZipFile { */ offsetEntry.dataOffset = offset + LFH_OFFSET_FOR_FILENAME_LENGTH + SHORT + SHORT + fileNameLen + extraFieldLen; + + if (entriesWithoutEFS.containsKey(ze)) { + setNameAndCommentFromExtraFields(ze, + (NameAndComment) + entriesWithoutEFS.get(ze)); + } } } @@ -551,7 +593,11 @@ public class ZipFile { return new String(bytes); } else { try { - return ZipEncodingHelper.decodeName(bytes, enc); + try { + return ZipEncodingHelper.decodeName(bytes, enc); + } catch (CharacterCodingException ex) { + throw (ZipException) new ZipException(ex.getMessage()).initCause(ex); + } } catch (java.nio.charset.UnsupportedCharsetException ex) { // Java 1.4's NIO doesn't recognize a few names that // String.getBytes does @@ -580,6 +626,64 @@ public class ZipFile { return true; } + /** + * If the entry has Unicode*ExtraFields and the CRCs of the + * names/comments match those of the extra fields, transfer the + * known Unicode values from the extra field. 
+ */ + private void setNameAndCommentFromExtraFields(ZipEntry ze, + NameAndComment nc) { + UnicodePathExtraField name = (UnicodePathExtraField) + ze.getExtraField(UnicodePathExtraField.UPATH_ID); + String originalName = ze.getName(); + String newName = getUnicodeStringIfOriginalMatches(name, nc.name); + if (newName != null && !originalName.equals(newName)) { + ze.setName(newName); + nameMap.remove(originalName); + nameMap.put(newName, ze); + } + + if (nc.comment != null && nc.comment.length > 0) { + UnicodeCommentExtraField cmt = (UnicodeCommentExtraField) + ze.getExtraField(UnicodeCommentExtraField.UCOM_ID); + String newComment = + getUnicodeStringIfOriginalMatches(cmt, nc.comment); + if (newComment != null) { + ze.setComment(newComment); + } + } + } + + /** + * If the stored CRC matches the one of the given name, return the + * Unicode name of the given field. + * + *

<p>If the field is null or the CRCs don't match, return null + * instead.</p>

+ */ + private String getUnicodeStringIfOriginalMatches(AbstractUnicodeExtraField f, + byte[] orig) { + if (f != null) { + CRC32 crc32 = new CRC32(); + crc32.update(orig); + long origCRC32 = crc32.getValue(); + + if (origCRC32 == f.getNameCRC32()) { + try { + return ZipEncodingHelper + .decodeName(f.getUnicodeName(), ZipOutputStream.UTF8); + } catch (CharacterCodingException ex) { + // decodeName reports malformed/unmappable input, so the + // Unicode*ExtraField must contain bytes that are not valid UTF-8 + + // TODO log this anywhere? + return null; + } + } + } + return null; + } + /** * InputStream that delegates requests to the underlying * RandomAccessFile, making sure that only bytes from a certain @@ -647,4 +751,12 @@ public class ZipFile { } } + private static final class NameAndComment { + private final byte[] name; + private final byte[] comment; + private NameAndComment(byte[] name, byte[] comment) { + this.name = name; + this.comment = comment; + } + } } diff --git a/src/main/org/apache/tools/zip/ZipOutputStream.java b/src/main/org/apache/tools/zip/ZipOutputStream.java index ed5abaf2f..3dd34afd2 100644 --- a/src/main/org/apache/tools/zip/ZipOutputStream.java +++ b/src/main/org/apache/tools/zip/ZipOutputStream.java @@ -260,6 +260,11 @@ public class ZipOutputStream extends FilterOutputStream { */ private boolean useEFS = true; + /** + * Whether to create Unicode path/comment extra fields for each entry. + */ + private boolean createUnicodeExtraFields = false; + /** * Creates a new ZIP OutputStream filtering the underlying stream. * @param out the outputstream to zip @@ -335,14 +340,24 @@ public class ZipOutputStream extends FilterOutputStream { } /** - * Whether to set the EFS flag if the file name encoding is UTF-8. + * Whether to set the language encoding flag if the file name + * encoding is UTF-8. * *

<p>Defaults to true.</p>

*/ - public void setUseEFS(boolean b) { + public void setUseLanguageEncodingFlag(boolean b) { useEFS = b && isUTF8(encoding); } + /** + * Whether to create Unicode Extra Fields for all entries. + * + *

<p>Defaults to false.</p>

+ */ + public void setCreateUnicodeExtraFields(boolean b) { + createUnicodeExtraFields = b; + } + /** * Finishs writing the contents and closes this as well as the * underlying stream. @@ -638,6 +653,17 @@ public class ZipOutputStream extends FilterOutputStream { * @since 1.1 */ protected void writeLocalFileHeader(ZipEntry ze) throws IOException { + + byte[] name = getBytes(ze.getName()); + if (createUnicodeExtraFields) { + ze.addExtraField(new UnicodePathExtraField(ze.getName(), name)); + String comm = ze.getComment(); + if (comm != null && !"".equals(comm)) { + byte[] commentB = getBytes(comm); + ze.addExtraField(new UnicodeCommentExtraField(comm, commentB)); + } + } + offsets.put(ze, ZipLong.getBytes(written)); writeOut(LFH_SIG); @@ -675,7 +701,6 @@ public class ZipOutputStream extends FilterOutputStream { // CheckStyle:MagicNumber ON // file name length - byte[] name = getBytes(ze.getName()); writeOut(ZipShort.getBytes(name.length)); written += SHORT;