From 752d64fa43db2172327771ba4d36bee3f2d3b49a Mon Sep 17 00:00:00 2001 From: Stefan Bodewig Date: Mon, 2 Mar 2009 17:17:09 +0000 Subject: [PATCH] improved zip-encoding support for JDK < 1.5, submitted by Wolfgang Glas, merge from commons-compress git-svn-id: https://svn.apache.org/repos/asf/ant/core/trunk@749368 13f79535-47bb-0310-9956-ffa450edef68 --- .../tools/zip/AbstractUnicodeExtraField.java | 36 ++- .../apache/tools/zip/FallbackZipEncoding.java | 94 ++++++ .../org/apache/tools/zip/NioZipEncoding.java | 122 ++++++++ .../tools/zip/Simple8BitZipEncoding.java | 261 +++++++++++++++++ .../tools/zip/UnicodeCommentExtraField.java | 17 +- .../tools/zip/UnicodePathExtraField.java | 16 +- .../org/apache/tools/zip/ZipEncoding.java | 85 ++++++ .../apache/tools/zip/ZipEncodingHelper.java | 274 +++++++++++------- src/main/org/apache/tools/zip/ZipFile.java | 67 ++--- .../org/apache/tools/zip/ZipOutputStream.java | 123 ++++---- .../apache/tools/zip/UTF8ZipFilesTest.java | 144 ++++++--- .../org/apache/tools/zip/ZipEncodingTest.java | 147 ++++++++++ 12 files changed, 1097 insertions(+), 289 deletions(-) create mode 100644 src/main/org/apache/tools/zip/FallbackZipEncoding.java create mode 100644 src/main/org/apache/tools/zip/NioZipEncoding.java create mode 100644 src/main/org/apache/tools/zip/Simple8BitZipEncoding.java create mode 100644 src/main/org/apache/tools/zip/ZipEncoding.java create mode 100644 src/tests/junit/org/apache/tools/zip/ZipEncodingTest.java diff --git a/src/main/org/apache/tools/zip/AbstractUnicodeExtraField.java b/src/main/org/apache/tools/zip/AbstractUnicodeExtraField.java index 49d998bcf..3ba2e0ea5 100644 --- a/src/main/org/apache/tools/zip/AbstractUnicodeExtraField.java +++ b/src/main/org/apache/tools/zip/AbstractUnicodeExtraField.java @@ -38,11 +38,25 @@ public abstract class AbstractUnicodeExtraField implements ZipExtraField { * encoding of the orginal zip entry. * * @param text The file name or comment. 
- * @param zipEncoding The encoding of the filenames in the zip - * file, usually "CP437". + * @param bytes The encoded of the filename or comment in the zip + * file. + * @param off The offset of the encoded filename or comment in + * bytes. + * @param len The length of the encoded filename or commentin + * bytes. */ - protected AbstractUnicodeExtraField(String text, String zipEncoding) { - this(text, ZipEncodingHelper.encodeName(text, zipEncoding)); + protected AbstractUnicodeExtraField(String text, byte[] bytes, int off, + int len) { + CRC32 crc32 = new CRC32(); + crc32.update(bytes, off, len); + nameCRC32 = crc32.getValue(); + + try { + unicodeName = text.getBytes("UTF-8"); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException("FATAL: UTF-8 encoding not supported.", + e); + } } /** @@ -50,20 +64,12 @@ public abstract class AbstractUnicodeExtraField implements ZipExtraField { * encoding of the orginal zip entry. * * @param text The file name or comment. - * @param zipEncoding The encoding of the filenames in the zip - * file, usually "CP437". + * @param bytes The encoded of the filename or comment in the zip + * file. */ protected AbstractUnicodeExtraField(String text, byte[] bytes) { - CRC32 crc32 = new CRC32(); - crc32.update(bytes); - nameCRC32 = crc32.getValue(); - try { - unicodeName = text.getBytes("UTF-8"); - } catch (UnsupportedEncodingException e) { - throw new RuntimeException("FATAL: UTF-8 encoding not supported.", - e); - } + this(text, bytes, 0, bytes.length); } private void assembleData() { diff --git a/src/main/org/apache/tools/zip/FallbackZipEncoding.java b/src/main/org/apache/tools/zip/FallbackZipEncoding.java new file mode 100644 index 000000000..4c2fe8500 --- /dev/null +++ b/src/main/org/apache/tools/zip/FallbackZipEncoding.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.tools.zip; + +import java.io.IOException; +import java.nio.ByteBuffer; + +/** + * A fallback ZipEncoding, which uses a java.io means to encode names. + * + *

This implementation is not favorable for encodings other than + * utf-8, because java.io encodes unmappable characters as question + * marks leading to unreadable ZIP entries on some operating + * systems.

+ * + *

Furthermore, this implementation is unable to tell whether a + * given name can be safely encoded or not.

+ * + *

This implementation acts as a last resort implementation, when + * neither {@link Simple8BitZipEncoding} nor {@link NioZipEncoding} is + * available.

+ * + *

The methods of this class are reentrant.

+ */ +class FallbackZipEncoding implements ZipEncoding { + private final String charset; + + /** + * Construct a fallback zip encoding, which uses the platform's + * default charset. + */ + public FallbackZipEncoding() { + this.charset = null; + } + + /** + * Construct a fallback zip encoding, which uses the given charset. + * + * @param charset The name of the charset or null for + * the platform's default character set. + */ + public FallbackZipEncoding(String charset) { + this.charset = charset; + } + + /** + * @see + * org.apache.tools.zip.ZipEncoding#canEncode(java.lang.String) + */ + public boolean canEncode(String name) { + return true; + } + + /** + * @see + * org.apache.tools.zip.ZipEncoding#encode(java.lang.String) + */ + public ByteBuffer encode(String name) throws IOException { + if (this.charset == null) { + return ByteBuffer.wrap(name.getBytes()); + } else { + return ByteBuffer.wrap(name.getBytes(this.charset)); + } + } + + /** + * @see + * org.apache.tools.zip.ZipEncoding#decode(byte[]) + */ + public String decode(byte[] data) throws IOException { + if (this.charset == null) { + return new String(data); + } else { + return new String(data,this.charset); + } + } +} diff --git a/src/main/org/apache/tools/zip/NioZipEncoding.java b/src/main/org/apache/tools/zip/NioZipEncoding.java new file mode 100644 index 000000000..a6870ca8f --- /dev/null +++ b/src/main/org/apache/tools/zip/NioZipEncoding.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.tools.zip; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; +import java.nio.charset.CodingErrorAction; + +/** + * A ZipEncoding, which uses a java.nio {@link + * java.nio.charset.Charset Charset} to encode names. + * + *

This implementation works for all cases under java-1.5 or + * later. However, in java-1.4, some charsets don't have a java-nio + * implementation, most notably the default ZIP encoding Cp437.

+ * + *

The methods of this class are reentrant.

+ */ +class NioZipEncoding implements ZipEncoding { + private final Charset charset; + + /** + * Construct an NIO based zip encoding, which wraps the given + * charset. + * + * @param charset The NIO charset to wrap. + */ + public NioZipEncoding(Charset charset) { + this.charset = charset; + } + + /** + * @see + * org.apache.tools.zip.ZipEncoding#canEncode(java.lang.String) + */ + public boolean canEncode(String name) { + CharsetEncoder enc = this.charset.newEncoder(); + enc.onMalformedInput(CodingErrorAction.REPORT); + enc.onUnmappableCharacter(CodingErrorAction.REPORT); + + return enc.canEncode(name); + } + + /** + * @see + * org.apache.tools.zip.ZipEncoding#encode(java.lang.String) + */ + public ByteBuffer encode(String name) { + CharsetEncoder enc = this.charset.newEncoder(); + + enc.onMalformedInput(CodingErrorAction.REPORT); + enc.onUnmappableCharacter(CodingErrorAction.REPORT); + + CharBuffer cb = CharBuffer.wrap(name); + ByteBuffer out = ByteBuffer.allocate(name.length() + + (name.length() + 1) / 2); + + while (cb.remaining() > 0) { + CoderResult res = enc.encode(cb, out,true); + + if (res.isUnmappable() || res.isMalformed()) { + + // write the unmappable characters in utf-16 + // pseudo-URL encoding style to ByteBuffer. + if (res.length() * 6 > out.remaining()) { + out = ZipEncodingHelper.growBuffer(out, out.position() + + res.length() * 6); + } + + for (int i=0; i + *
  • Characters 0x0000 to 0x007f are encoded as the corresponding + * byte values 0x00 to 0x7f.
  • + *
  • All byte codes from 0x80 to 0xff are mapped to a unique unicode + * character in the range 0x0080 to 0x7fff. (No support for + * UTF-16 surrogates) + * + * + *

    These restrictions most notably apply to the most prominent + * omissions of java-1.4's {@link java.nio.charset.Charset Charset} + * implementation, Cp437 and Cp850.

    + * + *

    The methods of this class are reentrant.

    + */ +class Simple8BitZipEncoding implements ZipEncoding { + + /** + * A character entity, which is put to the reverse mapping table + * of a simple encoding. + */ + private static final class Simple8BitChar implements Comparable { + public final char unicode; + public final byte code; + + Simple8BitChar(byte code, char unicode) { + this.code = code; + this.unicode = unicode; + } + + public int compareTo(Object o) { + Simple8BitChar a = (Simple8BitChar) o; + + return this.unicode - a.unicode; + } + + public String toString() { + return "0x" + Integer.toHexString(0xffff & (int) unicode) + + "->0x" + Integer.toHexString(0xff & (int) code); + } + } + + /** + * The characters for byte values of 128 to 255 stored as an array of + * 128 chars. + */ + private final char[] highChars; + + /** + * A list of {@see Simple8BitChar} objects sorted by the unicode + * field. This list is used to binary search reverse mapping of + * unicode characters with a character code greater than 127. + */ + private final List reverseMapping; + + /** + * @param highChars The characters for byte values of 128 to 255 + * stored as an array of 128 chars. + */ + public Simple8BitZipEncoding(char[] highChars) { + this.highChars = highChars; + this.reverseMapping = new ArrayList(this.highChars.length); + + byte code = 127; + + for (int i = 0; i < this.highChars.length; ++i) { + this.reverseMapping.add(new Simple8BitChar(++code, + this.highChars[i])); + } + + Collections.sort(this.reverseMapping); + } + + /** + * Return the character code for a given encoded byte. + * + * @param b The byte to decode. + * @return The associated character value. + */ + public char decodeByte(byte b) { + // code 0-127 + if (b >= 0) { + return (char) b; + } + + // byte is signed, so 128 == -128 and 255 == -1 + return this.highChars[128 + (int) b]; + } + + /** + * @param c The character to encode. + * @return Whether the given unicode character is covered by this encoding. 
+ */ + public boolean canEncodeChar(char c) { + + if (c >= 0 && c < 128) { + return true; + } + + Simple8BitChar r = this.encodeHighChar(c); + return r != null; + } + + /** + * Pushes the encoded form of the given character to the given byte buffer. + * + * @param bb The byte buffer to write to. + * @param c The character to encode. + * @return Whether the given unicode character is covered by this encoding. + * If false is returned, nothing is pushed to the + * byte buffer. + */ + public boolean pushEncodedChar(ByteBuffer bb, char c) { + + if (c >= 0 && c < 128) { + bb.put((byte) c); + return true; + } + + Simple8BitChar r = this.encodeHighChar(c); + if (r == null) { + return false; + } + bb.put(r.code); + return true; + } + + /** + * @param c A unicode character in the range from 0x0080 to 0x7f00 + * @return A Simple8BitChar, if this character is covered by this encoding. + * A null value is returned, if this character is not + * covered by this encoding. + */ + private Simple8BitChar encodeHighChar(char c) { + // for performance an simplicity, yet another reincarnation of + // binary search... + int i0 = 0; + int i1 = this.reverseMapping.size(); + + while (i1 > i0) { + + int i = i0 + (i1 - i0) / 2; + + Simple8BitChar m = (Simple8BitChar) this.reverseMapping.get(i); + + if (m.unicode == c) { + return m; + } + + if (m.unicode < c) { + i0 = i + 1; + } else { + i1 = i; + } + } + + if (i0 >= this.reverseMapping.size()) { + return null; + } + + Simple8BitChar r = (Simple8BitChar) this.reverseMapping.get(i0); + + if (r.unicode != c) { + return null; + } + + return r; + } + + /** + * @see + * org.apache.tools.zip.ZipEncoding#canEncode(java.lang.String) + */ + public boolean canEncode(String name) { + + for (int i=0;i"CP437". + * @param name The file name + * @param bytes the bytes actually written to the archive + * @param off The offset of the encoded comment in bytes. + * @param len The length of the encoded comment or comment in + * bytes. 
*/ - public UnicodeCommentExtraField(String comment, String zipEncoding) { - super(comment, zipEncoding); + public UnicodeCommentExtraField(String text, byte[] bytes, int off, + int len) { + super(text, bytes, off, len); } /** diff --git a/src/main/org/apache/tools/zip/UnicodePathExtraField.java b/src/main/org/apache/tools/zip/UnicodePathExtraField.java index dd3359a3d..8c26e1557 100644 --- a/src/main/org/apache/tools/zip/UnicodePathExtraField.java +++ b/src/main/org/apache/tools/zip/UnicodePathExtraField.java @@ -42,20 +42,22 @@ public class UnicodePathExtraField extends AbstractUnicodeExtraField { } /** - * Assemble as unicode path extension from the name and encoding - * of the orginal zip entry. + * Assemble as unicode path extension from the name given as + * text as well as the encoded bytes actually written to the archive. * * @param name The file name - * @param zipEncoding The encoding of the filename in the zip - * file, usually "CP437". + * @param bytes the bytes actually written to the archive + * @param off The offset of the encoded filename in bytes. + * @param len The length of the encoded filename or comment in + * bytes. */ - public UnicodePathExtraField(String name, String zipEncoding) { - super(name, zipEncoding); + public UnicodePathExtraField(String text, byte[] bytes, int off, int len) { + super(text, bytes, off, len); } /** * Assemble as unicode path extension from the name given as - * text as well as the bytes actually written to the archive. + * text as well as the encoded bytes actually written to the archive. 
* * @param name The file name * @param bytes the bytes actually written to the archive diff --git a/src/main/org/apache/tools/zip/ZipEncoding.java b/src/main/org/apache/tools/zip/ZipEncoding.java new file mode 100644 index 000000000..5dc88e4b4 --- /dev/null +++ b/src/main/org/apache/tools/zip/ZipEncoding.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.tools.zip; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.charset.Charset; + +/** + * An interface for encoders that do a pretty encoding of ZIP + * filenames. + * + *

    There are mostly two implementations, one that uses java.nio + * {@link java.nio.charset.Charset Charset} and one implementation, + * which copes with simple 8 bit charsets, because java-1.4 did not + * support Cp437 in java.nio.

    + * + *

The main reason for defining a separate encoding layer comes from + * the problems with {@link java.lang.String#getBytes(String) + * String.getBytes}, which encodes unknown characters as ASCII + * question marks ('?'), which is by definition an invalid filename + * character under some operating systems (Windows, e.g.) leading to + * ignored ZIP entries.

    + * + *

All implementations should implement this interface in a + * reentrant way.</p> + */ +interface ZipEncoding { + /** + * Check whether the given string may be losslessly encoded using this + * encoding. + * + * @param name A filename or ZIP comment. + * @return Whether the given name may be encoded without any loss. + */ + boolean canEncode(String name); + + /** + * Encode a filename or a comment to a byte array suitable for + * storing it to a serialized zip entry. + * + *

    Examples for CP 437 (in pseudo-notation, right hand side is + * C-style notation):

    + *
    +     *  encode("\u20AC_for_Dollar.txt") = "%U20AC_for_Dollar.txt"
    +     *  encode("\u00D6lf\u00E4sser.txt") = "\231lf\204sser.txt"
    +     * 
    + * + * @param name A filename or ZIP comment. + * @return A byte buffer with a backing array containing the + * encoded name. Unmappable characters or malformed + * character sequences are mapped to a sequence of utf-16 + * words encoded in the format %Uxxxx. It is + * assumed, that the byte buffer is positioned at the + * beinning of the encoded result, the byte buffer has a + * backing array and the limit of the byte buffer points + * to the end of the encoded result. + * @throws IOException + */ + ByteBuffer encode(String name) throws IOException; + + /** + * @param data The byte values to decode. + * @return The decoded string. + * @throws IOException + */ + String decode(byte [] data) throws IOException; +} diff --git a/src/main/org/apache/tools/zip/ZipEncodingHelper.java b/src/main/org/apache/tools/zip/ZipEncodingHelper.java index 1b5d16e97..e09327341 100644 --- a/src/main/org/apache/tools/zip/ZipEncodingHelper.java +++ b/src/main/org/apache/tools/zip/ZipEncodingHelper.java @@ -19,17 +19,119 @@ package org.apache.tools.zip; import java.nio.ByteBuffer; -import java.nio.CharBuffer; import java.nio.charset.Charset; -import java.nio.charset.CharsetEncoder; -import java.nio.charset.CoderResult; -import java.nio.charset.CodingErrorAction; +import java.nio.charset.UnsupportedCharsetException; +import java.util.HashMap; +import java.util.Map; /** * Static helper functions for robustly encoding filenames in zip files. */ abstract class ZipEncodingHelper { + /** + * A class, which holds the high characters of a simple encoding + * and lazily instantiates a Simple8BitZipEncoding instance in a + * thread-safe manner. + */ + private static class SimpleEncodingHolder { + + private final char [] highChars; + private Simple8BitZipEncoding encoding; + + /** + * Instantiate a simple encoding holder. + * + * @param highChars The characters for byte codes 128 to 255. 
+ * + * @see Simple8BitZipEncoding#Simple8BitZipEncoding(char[]) + */ + SimpleEncodingHolder(char [] highChars) { + this.highChars = highChars; + } + + /** + * @return The associated {@see Simple8BitZipEncoding}, which + * is instantiated if not done so far. + */ + public synchronized Simple8BitZipEncoding getEncoding() { + if (this.encoding == null) { + this.encoding = new Simple8BitZipEncoding(this.highChars); + } + return this.encoding; + } + } + + private static final Map simpleEncodings; + + static { + simpleEncodings = new HashMap(); + + char[] cp437_high_chars = + new char[] { 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, + 0x00e5, 0x00e7, 0x00ea, 0x00eb, 0x00e8, 0x00ef, + 0x00ee, 0x00ec, 0x00c4, 0x00c5, 0x00c9, 0x00e6, + 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, + 0x00ff, 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, + 0x20a7, 0x0192, 0x00e1, 0x00ed, 0x00f3, 0x00fa, + 0x00f1, 0x00d1, 0x00aa, 0x00ba, 0x00bf, 0x2310, + 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb, + 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, + 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, + 0x255d, 0x255c, 0x255b, 0x2510, 0x2514, 0x2534, + 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f, + 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, + 0x256c, 0x2567, 0x2568, 0x2564, 0x2565, 0x2559, + 0x2558, 0x2552, 0x2553, 0x256b, 0x256a, 0x2518, + 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, + 0x03b1, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, + 0x00b5, 0x03c4, 0x03a6, 0x0398, 0x03a9, 0x03b4, + 0x221e, 0x03c6, 0x03b5, 0x2229, 0x2261, 0x00b1, + 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248, + 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, + 0x25a0, 0x00a0 }; + + SimpleEncodingHolder cp437 = new SimpleEncodingHolder(cp437_high_chars); + + simpleEncodings.put("CP437",cp437); + simpleEncodings.put("Cp437",cp437); + simpleEncodings.put("cp437",cp437); + simpleEncodings.put("IBM437",cp437); + simpleEncodings.put("ibm437",cp437); + + char[] cp850_high_chars = + new char[] { 0x00c7, 0x00fc, 0x00e9, 
0x00e2, 0x00e4, 0x00e0, + 0x00e5, 0x00e7, 0x00ea, 0x00eb, 0x00e8, 0x00ef, + 0x00ee, 0x00ec, 0x00c4, 0x00c5, 0x00c9, 0x00e6, + 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, + 0x00ff, 0x00d6, 0x00dc, 0x00f8, 0x00a3, 0x00d8, + 0x00d7, 0x0192, 0x00e1, 0x00ed, 0x00f3, 0x00fa, + 0x00f1, 0x00d1, 0x00aa, 0x00ba, 0x00bf, 0x00ae, + 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb, + 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00c1, + 0x00c2, 0x00c0, 0x00a9, 0x2563, 0x2551, 0x2557, + 0x255d, 0x00a2, 0x00a5, 0x2510, 0x2514, 0x2534, + 0x252c, 0x251c, 0x2500, 0x253c, 0x00e3, 0x00c3, + 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, + 0x256c, 0x00a4, 0x00f0, 0x00d0, 0x00ca, 0x00cb, + 0x00c8, 0x0131, 0x00cd, 0x00ce, 0x00cf, 0x2518, + 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580, + 0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, + 0x00b5, 0x00fe, 0x00de, 0x00da, 0x00db, 0x00d9, + 0x00fd, 0x00dd, 0x00af, 0x00b4, 0x00ad, 0x00b1, + 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8, + 0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, + 0x25a0, 0x00a0 }; + + SimpleEncodingHolder cp850 = new SimpleEncodingHolder(cp850_high_chars); + + simpleEncodings.put("CP850",cp850); + simpleEncodings.put("Cp850",cp850); + simpleEncodings.put("cp850",cp850); + simpleEncodings.put("IBM850",cp850); + simpleEncodings.put("ibm850",cp850); + } + /** * Grow a byte buffer, so it has a minimal capacity or at least * the double capacity of the original buffer @@ -53,7 +155,7 @@ abstract class ZipEncodingHelper { return on; } - + /** * The hexadecimal digits 0,...,9,A,...,F encoded as * ASCII bytes. @@ -65,131 +167,79 @@ abstract class ZipEncodingHelper { }; /** - * Encode a filename or a comment to a byte array suitable for - * storing it to a serialized zip entry. + * Append %Uxxxx to the given byte buffer. + * The caller must assure, that bb.remaining()>=6. * - * Examples (in pseudo-notation, right hand side is C-style notation): - *
    -     *  encodeName("\u20AC_for_Dollar.txt","CP437") = "%U20AC_for_Dollar.txt"
    -     *  encodeName("\u00D6lf\u00E4sser.txt","CP437") = "\231lf\204sser.txt"
    -     * 
    - * - * @param name The filename or comment with possible non-ASCII - * unicode characters. Must not be null. - * @param encoding A valid encoding name. The standard zip - * encoding is "CP437", - * "UTF-8" is supported in ZIP file - * version 6.3 or later. If null, - * will use the platform's {@link - * java.lang.String#getBytes default encoding}. - * @return A byte array containing the mapped file - * name. Unmappable characters or malformed character - * sequences are mapped to a sequence of utf-16 words - * encoded in the format %Uxxxx. + * @param bb The byte buffer to write to. + * @param c The character to write. */ - static final byte[] encodeName(String name, String encoding) { - if (encoding == null) { - return name.getBytes(); - } - - Charset cs = Charset.forName(encoding); - CharsetEncoder enc = cs.newEncoder(); - - enc.onMalformedInput(CodingErrorAction.REPORT); - enc.onUnmappableCharacter(CodingErrorAction.REPORT); - - CharBuffer cb = CharBuffer.wrap(name); - ByteBuffer out = ByteBuffer.allocate(name.length() - + (name.length() + 1) / 2); - - while (cb.remaining() > 0) { - CoderResult res = enc.encode(cb, out,true); - - if (res.isUnmappable() || res.isMalformed()) { - - // write the unmappable characters in utf-16 - // pseudo-URL encoding style to ByteBuffer. 
- if (res.length() * 6 > out.remaining()) { - out = growBuffer(out,out.position() + res.length() * 6); - } - - for (int i=0; i> 12)&0x0f]); - out.put(HEX_DIGITS[(c >> 8)&0x0f]); - out.put(HEX_DIGITS[(c >> 4)&0x0f]); - out.put(HEX_DIGITS[c & 0x0f]); - } + bb.put(HEX_DIGITS[(c >> 12)&0x0f]); + bb.put(HEX_DIGITS[(c >> 8)&0x0f]); + bb.put(HEX_DIGITS[(c >> 4)&0x0f]); + bb.put(HEX_DIGITS[c & 0x0f]); + } - } else if (res.isOverflow()) { - out = growBuffer(out, 0); + /** + * name of the encoding UTF-8 + */ + static final String UTF8 = "UTF8"; - } else if (res.isUnderflow()) { + /** + * name of the encoding UTF-8 + */ + static final ZipEncoding UTF8_ZIP_ENCODING = new FallbackZipEncoding(UTF8); - enc.flush(out); - break; + /** + * Instantiates a zip encoding. + * + * @param name The name of the zip encoding. Specify null for + * the platform's default encoding. + * @return A zip encoding for the given encoding name. + */ + static ZipEncoding getZipEncoding(String name) { + + // fallback encoding is good enough for utf-8. + if (isUTF8(name)) { + return UTF8_ZIP_ENCODING; + } - } + if (name == null) { + return new FallbackZipEncoding(); } - byte [] ret = new byte[out.position()]; - out.rewind(); - out.get(ret); + SimpleEncodingHolder h = + (SimpleEncodingHolder) simpleEncodings.get(name); - return ret; - } - - /** - * Return, whether a filename or a comment may be encoded to a - * byte array suitable for storing it to a serialized zip entry - * without any losses. - * - * Examples (in pseudo-notation, right hand side is C-style notation): - *
    -     *  canEncodeName("\u20AC_for_Dollar.txt","CP437") = false
    -     *  canEncodeName("\u20AC_for_Dollar.txt","UTF-8") = true
    -     *  canEncodeName("\u00D6lf\u00E4sser.txt","CP437") = true
    -     * 
    - * - * @param name The filename or comment with possible non-ASCII - * unicode characters. - * @param encoding A valid encoding name. The standard zip - * encoding is "CP437", - * "UTF-8" is supported in ZIP file - * version 6.3 or later. - * @return Whether the given encoding may encode the given name. - */ - static final boolean canEncodeName(String name, String encoding) { + if (h!=null) { + return h.getEncoding(); + } - Charset cs = Charset.forName(encoding); + try { - CharsetEncoder enc = cs.newEncoder(); - enc.onMalformedInput(CodingErrorAction.REPORT); - enc.onUnmappableCharacter(CodingErrorAction.REPORT); + Charset cs = Charset.forName(name); + return new NioZipEncoding(cs); - return enc.canEncode(name); + } catch (UnsupportedCharsetException e) { + return new FallbackZipEncoding(name); + } } /** - * Decode a filename or a comment from a byte array. - * - * @param name The filename or comment. - * @param encoding A valid encoding name. The standard zip - * encoding is "CP437", - * "UTF-8" is supported in ZIP file - * version 6.3 or later. + * Whether a given encoding - or the platform's default encoding + * if the parameter is null - is UTF-8. 
*/ - static final String decodeName(byte[] name, String encoding) - throws java.nio.charset.CharacterCodingException { - Charset cs = Charset.forName(encoding); - return cs.newDecoder() - .onMalformedInput(CodingErrorAction.REPORT) - .onUnmappableCharacter(CodingErrorAction.REPORT) - .decode(ByteBuffer.wrap(name)).toString(); + static boolean isUTF8(String encoding) { + if (encoding == null) { + // check platform's default encoding + encoding = System.getProperty("file.encoding"); + } + return UTF8.equalsIgnoreCase(encoding) + || "utf-8".equalsIgnoreCase(encoding); } } diff --git a/src/main/org/apache/tools/zip/ZipFile.java b/src/main/org/apache/tools/zip/ZipFile.java index bd83f6c6f..bd6a47df1 100644 --- a/src/main/org/apache/tools/zip/ZipFile.java +++ b/src/main/org/apache/tools/zip/ZipFile.java @@ -22,8 +22,6 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.RandomAccessFile; -import java.io.UnsupportedEncodingException; -import java.nio.charset.CharacterCodingException; import java.util.Calendar; import java.util.Collections; import java.util.Date; @@ -98,6 +96,11 @@ public class ZipFile { */ private String encoding = null; + /** + * The zip encoding to use for filenames and the file comment. + */ + private final ZipEncoding zipEncoding; + /** * The actual data source. */ @@ -164,15 +167,17 @@ public class ZipFile { * encoding for file names. * * @param f the archive. - * @param encoding the encoding to use for file names - * @param whether to use InfoZIP Unicode Extra Fields (if present) - * to set the file names. + * @param encoding the encoding to use for file names, use null + * for the platform's default encoding + * @param useUnicodeExtraFields whether to use InfoZIP Unicode + * Extra Fields (if present) to set the file names. * * @throws IOException if an error occurs while reading the file. 
*/ public ZipFile(File f, String encoding, boolean useUnicodeExtraFields) throws IOException { this.encoding = encoding; + this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding); this.useUnicodeExtraFields = useUnicodeExtraFields; archive = new RandomAccessFile(f, "r"); boolean success = false; @@ -247,7 +252,8 @@ public class ZipFile { * @param ze the entry to get the stream for. * @return a stream to read the entry from. * @throws IOException if unable to create an input stream from the zipenty - * @throws ZipException if the zipentry has an unsupported compression method + * @throws ZipException if the zipentry has an unsupported + * compression method */ public InputStream getInputStream(ZipEntry ze) throws IOException, ZipException { @@ -330,8 +336,8 @@ public class ZipFile { final int generalPurposeFlag = ZipShort.getValue(cfh, off); final boolean hasEFS = (generalPurposeFlag & ZipOutputStream.EFS_FLAG) != 0; - final String entryEncoding = - hasEFS ? ZipOutputStream.UTF8 : encoding; + final ZipEncoding entryEncoding = + hasEFS ? 
ZipEncodingHelper.UTF8_ZIP_ENCODING : zipEncoding; off += SHORT; @@ -373,7 +379,7 @@ public class ZipFile { byte[] fileName = new byte[fileNameLen]; archive.readFully(fileName); - ze.setName(getString(fileName, entryEncoding)); + ze.setName(entryEncoding.decode(fileName)); // LFH offset, OffsetEntry offset = new OffsetEntry(); @@ -395,7 +401,7 @@ public class ZipFile { byte[] comment = new byte[commentLen]; archive.readFully(comment); - ze.setComment(getString(comment, entryEncoding)); + ze.setComment(entryEncoding.decode(comment)); archive.readFully(signatureBytes); sig = ZipLong.getValue(signatureBytes); @@ -529,7 +535,7 @@ public class ZipFile { + SHORT + SHORT + fileNameLen + extraFieldLen)); */ offsetEntry.dataOffset = offset + LFH_OFFSET_FOR_FILENAME_LENGTH - + SHORT + SHORT + fileNameLen + extraFieldLen; + + SHORT + SHORT + fileNameLen + extraFieldLen; if (entriesWithoutEFS.containsKey(ze)) { setNameAndCommentFromExtraFields(ze, @@ -576,37 +582,10 @@ public class ZipFile { * @throws ZipException if the encoding cannot be recognized. */ protected String getString(byte[] bytes) throws ZipException { - return getString(bytes, encoding); - } - - /** - * Retrieve a String from the given bytes using the encoding set - * for this ZipFile. - * - * @param bytes the byte array to transform - * @return String obtained by using the given encoding - * @throws ZipException if the encoding cannot be recognized. 
- */ - protected String getString(byte[] bytes, String enc) - throws ZipException { - if (enc == null) { - return new String(bytes); - } else { - try { - try { - return ZipEncodingHelper.decodeName(bytes, enc); - } catch (CharacterCodingException ex) { - throw new ZipException(ex.getMessage()); - } - } catch (java.nio.charset.UnsupportedCharsetException ex) { - // Java 1.4's NIO doesn't recognize a few names that - // String.getBytes does - try { - return new String(bytes, enc); - } catch (UnsupportedEncodingException uee) { - throw new ZipException(uee.getMessage()); - } - } + try { + return ZipEncodingHelper.getZipEncoding(encoding).decode(bytes); + } catch (IOException ex) { + throw new ZipException("Failed to decode name: " + ex.getMessage()); } } @@ -671,8 +650,8 @@ public class ZipFile { if (origCRC32 == f.getNameCRC32()) { try { return ZipEncodingHelper - .decodeName(f.getUnicodeName(), ZipOutputStream.UTF8); - } catch (CharacterCodingException ex) { + .UTF8_ZIP_ENCODING.decode(f.getUnicodeName()); + } catch (IOException ex) { // UTF-8 unsupported? should be impossible the // Unicode*ExtraField must contain some bad bytes diff --git a/src/main/org/apache/tools/zip/ZipOutputStream.java b/src/main/org/apache/tools/zip/ZipOutputStream.java index 3dd34afd2..cad9f9a95 100644 --- a/src/main/org/apache/tools/zip/ZipOutputStream.java +++ b/src/main/org/apache/tools/zip/ZipOutputStream.java @@ -24,7 +24,7 @@ import java.io.FilterOutputStream; import java.io.IOException; import java.io.OutputStream; import java.io.RandomAccessFile; -import java.io.UnsupportedEncodingException; +import java.nio.ByteBuffer; import java.util.Date; import java.util.HashMap; import java.util.Iterator; @@ -92,9 +92,9 @@ public class ZipOutputStream extends FilterOutputStream { public static final int STORED = java.util.zip.ZipEntry.STORED; /** - * name of the encoding UTF-8 + * default encoding for file names and comment. 
*/ - static final String UTF8 = "UTF8"; + static final String DEFAULT_ENCODING = null; /** * General purpose flag, which indicates that filenames are @@ -220,7 +220,16 @@ public class ZipOutputStream extends FilterOutputStream { */ private String encoding = null; - // CheckStyle:VisibilityModifier OFF - bc + /** + * The zip encoding to use for filenames and the file comment. + * + * This field is of internal use and will be set in {@link + * #setEncoding(String)}. + */ + private ZipEncoding zipEncoding = + ZipEncodingHelper.getZipEncoding(DEFAULT_ENCODING); + + // CheckStyle:VisibilityModifier OFF - bc /** * This Deflater object is used for output. @@ -301,8 +310,8 @@ public class ZipOutputStream extends FilterOutputStream { } /** - * This method indicates whether this archive is writing to a seekable stream (i.e., to a random - * access file). + * This method indicates whether this archive is writing to a + * seekable stream (i.e., to a random access file). * *

    For seekable streams, you don't need to calculate the CRC or * uncompressed size for {@link #STORED} entries before @@ -325,7 +334,8 @@ public class ZipOutputStream extends FilterOutputStream { */ public void setEncoding(final String encoding) { this.encoding = encoding; - useEFS &= isUTF8(encoding); + this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding); + useEFS &= ZipEncodingHelper.isUTF8(encoding); } /** @@ -346,7 +356,7 @@ public class ZipOutputStream extends FilterOutputStream { *

    Defaults to true.

    */ public void setUseLanguageEncodingFlag(boolean b) { - useEFS = b && isUTF8(encoding); + useEFS = b && ZipEncodingHelper.isUTF8(encoding); } /** @@ -499,14 +509,15 @@ public class ZipOutputStream extends FilterOutputStream { * *

    Default is Deflater.DEFAULT_COMPRESSION.

    * @param level the compression level. - * @throws IllegalArgumentException if an invalid compression level is specified. + * @throws IllegalArgumentException if an invalid compression + * level is specified. * @since 1.1 */ public void setLevel(int level) { if (level < Deflater.DEFAULT_COMPRESSION || level > Deflater.BEST_COMPRESSION) { - throw new IllegalArgumentException( - "Invalid compression level: " + level); + throw new IllegalArgumentException("Invalid compression level: " + + level); } hasCompressionLevelChanged = (this.level != level); this.level = level; @@ -654,13 +665,31 @@ public class ZipOutputStream extends FilterOutputStream { */ protected void writeLocalFileHeader(ZipEntry ze) throws IOException { - byte[] name = getBytes(ze.getName()); + boolean encodable = this.zipEncoding.canEncode(ze.getName()); + ByteBuffer name = this.zipEncoding.encode(ze.getName()); + if (createUnicodeExtraFields) { - ze.addExtraField(new UnicodePathExtraField(ze.getName(), name)); + + /* if (!encodable) { -- FIXME decide what to*/ + ze.addExtraField(new UnicodePathExtraField(ze.getName(), + name.array(), + name.arrayOffset(), + name.limit())); + /* } */ + String comm = ze.getComment(); if (comm != null && !"".equals(comm)) { - byte[] commentB = getBytes(comm); - ze.addExtraField(new UnicodeCommentExtraField(comm, commentB)); + + boolean commentEncodable = this.zipEncoding.canEncode(comm); + + /* if (!commentEncodable) { -- FIXME decide what to*/ + ByteBuffer commentB = this.zipEncoding.encode(comm); + ze.addExtraField(new UnicodeCommentExtraField(comm, + commentB.array(), + commentB.arrayOffset(), + commentB.limit()) + ); + /* } */ } } @@ -701,7 +730,7 @@ public class ZipOutputStream extends FilterOutputStream { // CheckStyle:MagicNumber ON // file name length - writeOut(ZipShort.getBytes(name.length)); + writeOut(ZipShort.getBytes(name.limit())); written += SHORT; // extra field length @@ -710,8 +739,8 @@ public class ZipOutputStream extends FilterOutputStream { 
written += SHORT; // file name - writeOut(name); - written += name.length; + writeOut(name.array(), name.arrayOffset(), name.limit()); + written += name.limit(); // extra field writeOut(extra); @@ -779,8 +808,8 @@ public class ZipOutputStream extends FilterOutputStream { // CheckStyle:MagicNumber ON // file name length - byte[] name = getBytes(ze.getName()); - writeOut(ZipShort.getBytes(name.length)); + ByteBuffer name = this.zipEncoding.encode(ze.getName()); + writeOut(ZipShort.getBytes(name.limit())); written += SHORT; // extra field length @@ -793,8 +822,8 @@ public class ZipOutputStream extends FilterOutputStream { if (comm == null) { comm = ""; } - byte[] commentB = getBytes(comm); - writeOut(ZipShort.getBytes(commentB.length)); + ByteBuffer commentB = this.zipEncoding.encode(comm); + writeOut(ZipShort.getBytes(commentB.limit())); written += SHORT; // disk number start @@ -814,16 +843,16 @@ public class ZipOutputStream extends FilterOutputStream { written += WORD; // file name - writeOut(name); - written += name.length; + writeOut(name.array(), name.arrayOffset(), name.limit()); + written += name.limit(); // extra field writeOut(extra); written += extra.length; // file comment - writeOut(commentB); - written += commentB.length; + writeOut(commentB.array(), commentB.arrayOffset(), commentB.limit()); + written += commentB.limit(); } /** @@ -849,9 +878,9 @@ public class ZipOutputStream extends FilterOutputStream { writeOut(ZipLong.getBytes(cdOffset)); // ZIP file comment - byte[] data = getBytes(comment); - writeOut(ZipShort.getBytes(data.length)); - writeOut(data); + ByteBuffer data = this.zipEncoding.encode(comment); + writeOut(ZipShort.getBytes(data.limit())); + writeOut(data.array(), data.arrayOffset(), data.limit()); } /** @@ -908,20 +937,15 @@ public class ZipOutputStream extends FilterOutputStream { * @since 1.3 */ protected byte[] getBytes(String name) throws ZipException { - if (encoding == null) { - return name.getBytes(); - } else { - try { - return 
ZipEncodingHelper.encodeName(name, encoding); - } catch (java.nio.charset.UnsupportedCharsetException ex) { - // Java 1.4's NIO doesn't recognize a few names that - // String.getBytes does - try { - return name.getBytes(encoding); - } catch (UnsupportedEncodingException uee) { - throw new ZipException(uee.getMessage()); - } - } + try { + ByteBuffer b = + ZipEncodingHelper.getZipEncoding(encoding).encode(name); + byte[] result = new byte[b.limit()]; + System.arraycopy(b.array(), b.arrayOffset(), result, 0, + result.length); + return result; + } catch (IOException ex) { + throw new ZipException("Failed to encode name: " + ex.getMessage()); } } @@ -975,19 +999,6 @@ public class ZipOutputStream extends FilterOutputStream { } } - /** - * Whether a given encoding - or the platform's default encoding - * if the parameter is null - is UTF-8. - */ - static boolean isUTF8(String encoding) { - if (encoding == null) { - // check platform's default encoding - encoding = System.getProperty("file.encoding"); - } - return UTF8.equalsIgnoreCase(encoding) - || "utf-8".equalsIgnoreCase(encoding); - } - private void writeVersionNeededToExtractAndGeneralPurposeBits(final int zipMethod) throws IOException { diff --git a/src/tests/junit/org/apache/tools/zip/UTF8ZipFilesTest.java b/src/tests/junit/org/apache/tools/zip/UTF8ZipFilesTest.java index bb8246fe5..3e7edb1dd 100644 --- a/src/tests/junit/org/apache/tools/zip/UTF8ZipFilesTest.java +++ b/src/tests/junit/org/apache/tools/zip/UTF8ZipFilesTest.java @@ -19,12 +19,14 @@ package org.apache.tools.zip; import java.io.File; -import java.io.FileInputStream; import java.io.IOException; import java.io.UnsupportedEncodingException; -import java.nio.charset.Charset; -import java.nio.charset.UnsupportedCharsetException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.nio.ByteBuffer; import java.util.Enumeration; +import java.util.zip.CRC32; import junit.framework.TestCase; public class UTF8ZipFilesTest 
extends TestCase { @@ -36,33 +38,70 @@ public class UTF8ZipFilesTest extends TestCase { private static final String EURO_FOR_DOLLAR_TXT = "\u20AC_for_Dollar.txt"; private static final String OIL_BARREL_TXT = "\u00D6lf\u00E4sser.txt"; - public void testUtf8FileRoundtrip() throws IOException { - testFileRoundtrip(UTF_8); + public void testUtf8FileRoundtripExplicitUnicodeExtra() + throws IOException { + testFileRoundtrip(UTF_8, true, true); + } + + public void testUtf8FileRoundtripNoEFSExplicitUnicodeExtra() + throws IOException { + testFileRoundtrip(UTF_8, false, true); + } + + public void testCP437FileRoundtripExplicitUnicodeExtra() + throws IOException { + testFileRoundtrip(CP437, false, true); + } + + public void testASCIIFileRoundtripExplicitUnicodeExtra() + throws IOException { + testFileRoundtrip(US_ASCII, false, true); } + public void testUtf8FileRoundtripImplicitUnicodeExtra() + throws IOException { + testFileRoundtrip(UTF_8, true, false); + } - public void testCP437FileRoundtrip() throws IOException { - testFileRoundtrip(CP437); + public void testUtf8FileRoundtripNoEFSImplicitUnicodeExtra() + throws IOException { + testFileRoundtrip(UTF_8, false, false); } - public void testASCIIFileRoundtrip() throws IOException { - testFileRoundtrip(US_ASCII); + public void testCP437FileRoundtripImplicitUnicodeExtra() + throws IOException { + testFileRoundtrip(CP437, false, false); } - private static void testFileRoundtrip(String encoding) + public void testASCIIFileRoundtripImplicitUnicodeExtra() throws IOException { + testFileRoundtrip(US_ASCII, false, false); + } + public void testZipFileReadsUnicodeFields() throws IOException { + File file = File.createTempFile("unicode-test", ".zip"); + ZipFile zf = null; try { - Charset.forName(encoding); - } catch (UnsupportedCharsetException use) { - System.err.println("Skipping testFileRoundtrip for unsupported " - + " encoding " + encoding); - return; + createTestFile(file, US_ASCII, false, true); + zf = new ZipFile(file, 
US_ASCII, true); + assertNotNull(zf.getEntry(ASCII_TXT)); + assertNotNull(zf.getEntry(EURO_FOR_DOLLAR_TXT)); + assertNotNull(zf.getEntry(OIL_BARREL_TXT)); + } finally { + ZipFile.closeQuietly(zf); + if (file.exists()) { + file.delete(); + } } + } + + private static void testFileRoundtrip(String encoding, boolean withEFS, + boolean withExplicitUnicodeExtra) + throws IOException { File file = File.createTempFile(encoding + "-test", ".zip"); try { - createTestFile(file, encoding); + createTestFile(file, encoding, withEFS, withExplicitUnicodeExtra); testFile(file, encoding); } finally { if (file.exists()) { @@ -71,19 +110,30 @@ public class UTF8ZipFilesTest extends TestCase { } } - private static void createTestFile(File file, String encoding) + private static void createTestFile(File file, String encoding, + boolean withEFS, + boolean withExplicitUnicodeExtra) throws UnsupportedEncodingException, IOException { + ZipEncoding zipEncoding = ZipEncodingHelper.getZipEncoding(encoding); + ZipOutputStream zos = null; try { zos = new ZipOutputStream(file); zos.setEncoding(encoding); + zos.setUseLanguageEncodingFlag(withEFS); + zos.setCreateUnicodeExtraFields(!withExplicitUnicodeExtra); ZipEntry ze = new ZipEntry(OIL_BARREL_TXT); - if (!ZipEncodingHelper.canEncodeName(ze.getName(), - zos.getEncoding())) { + if (withExplicitUnicodeExtra + && !zipEncoding.canEncode(ze.getName())) { + + ByteBuffer en = zipEncoding.encode(ze.getName()); + ze.addExtraField(new UnicodePathExtraField(ze.getName(), - zos.getEncoding())); + en.array(), + en.arrayOffset(), + en.limit())); } zos.putNextEntry(ze); @@ -91,10 +141,15 @@ public class UTF8ZipFilesTest extends TestCase { zos.closeEntry(); ze = new ZipEntry(EURO_FOR_DOLLAR_TXT); - if (!ZipEncodingHelper.canEncodeName(ze.getName(), - zos.getEncoding())) { + if (withExplicitUnicodeExtra + && !zipEncoding.canEncode(ze.getName())) { + + ByteBuffer en = zipEncoding.encode(ze.getName()); + ze.addExtraField(new UnicodePathExtraField(ze.getName(), - 
zos.getEncoding())); + en.array(), + en.arrayOffset(), + en.limit())); } zos.putNextEntry(ze); @@ -103,10 +158,15 @@ public class UTF8ZipFilesTest extends TestCase { ze = new ZipEntry(ASCII_TXT); - if (!ZipEncodingHelper.canEncodeName(ze.getName(), - zos.getEncoding())) { + if (withExplicitUnicodeExtra + && !zipEncoding.canEncode(ze.getName())) { + + ByteBuffer en = zipEncoding.encode(ze.getName()); + ze.addExtraField(new UnicodePathExtraField(ze.getName(), - zos.getEncoding())); + en.array(), + en.arrayOffset(), + en.limit())); } zos.putNextEntry(ze); @@ -125,7 +185,7 @@ public class UTF8ZipFilesTest extends TestCase { throws IOException { ZipFile zf = null; try { - zf = new ZipFile(file, encoding); + zf = new ZipFile(file, encoding, false); Enumeration e = zf.getEntries(); while (e.hasMoreElements()) { @@ -147,14 +207,8 @@ public class UTF8ZipFilesTest extends TestCase { } private static UnicodePathExtraField findUniCodePath(ZipEntry ze) { - - ZipExtraField[] efs = ze.getExtraFields(); - for (int i = 0; i < efs.length; ++i) { - if (efs[i].getHeaderId().equals(UnicodePathExtraField.UPATH_ID)) { - return (UnicodePathExtraField) efs[i]; - } - } - return null; + return (UnicodePathExtraField) + ze.getExtraField(UnicodePathExtraField.UPATH_ID); } private static void assertUnicodeName(ZipEntry ze, @@ -165,23 +219,17 @@ public class UTF8ZipFilesTest extends TestCase { UnicodePathExtraField ucpf = findUniCodePath(ze); assertNotNull(ucpf); - UnicodePathExtraField ucpe = new UnicodePathExtraField(expectedName, - encoding); - assertEquals(ucpe.getNameCRC32(), ucpf.getNameCRC32()); + ZipEncoding enc = ZipEncodingHelper.getZipEncoding(encoding); + ByteBuffer ne = enc.encode(ze.getName()); + + CRC32 crc = new CRC32(); + crc.update(ne.array(),ne.arrayOffset(),ne.limit()); + + assertEquals(crc.getValue(), ucpf.getNameCRC32()); assertEquals(expectedName, new String(ucpf.getUnicodeName(), UTF_8)); } } - /* - public void testUtf8Interoperability() throws IOException { - File file1 
= super.getFile("utf8-7zip-test.zip"); - File file2 = super.getFile("utf8-winzip-test.zip"); - - testFile(file1,CP437); - testFile(file2,CP437); - - } - */ } diff --git a/src/tests/junit/org/apache/tools/zip/ZipEncodingTest.java b/src/tests/junit/org/apache/tools/zip/ZipEncodingTest.java new file mode 100644 index 000000000..d935fa988 --- /dev/null +++ b/src/tests/junit/org/apache/tools/zip/ZipEncodingTest.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.tools.zip; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import junit.framework.TestCase; + +/** + * Test zip encodings. + */ +public class ZipEncodingTest extends TestCase { + private static final String UNENC_STRING = "\u2016"; + + // stress test for internal grow method. 
+ private static final String BAD_STRING = + "\u2016\u2015\u2016\u2015\u2016\u2015\u2016\u2015\u2016\u2015\u2016"; + + private static final String BAD_STRING_ENC = + "%U2016%U2015%U2016%U2015%U2016%U2015%U2016%U2015%U2016%U2015%U2016"; + + public void testSimpleCp437Encoding() throws IOException { + + doSimpleEncodingTest("Cp437", null); + } + + public void testSimpleCp850Encoding() throws IOException { + + doSimpleEncodingTest("Cp850", null); + } + + public void testNioCp1252Encoding() throws IOException { + // CP1252 has some undefined code points, these are + // the defined ones + // retrieved by + // awk '/^0x/ && NF>2 {print $1;}' CP1252.TXT + byte[] b = + new byte[] { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, + (byte) 0x80, (byte) 0x82, (byte) 0x83, (byte) 0x84, + (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, + (byte) 0x89, (byte) 0x8A, (byte) 0x8B, (byte) 0x8C, + (byte) 0x8E, (byte) 0x91, (byte) 0x92, (byte) 0x93, + (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, + (byte) 0x98, (byte) 0x99, (byte) 0x9A, (byte) 0x9B, + (byte) 0x9C, (byte) 0x9E, (byte) 0x9F, (byte) 0xA0, + (byte) 0xA1, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, + (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, + (byte) 0xA9, (byte) 0xAA, 
(byte) 0xAB, (byte) 0xAC, + (byte) 0xAD, (byte) 0xAE, (byte) 0xAF, (byte) 0xB0, + (byte) 0xB1, (byte) 0xB2, (byte) 0xB3, (byte) 0xB4, + (byte) 0xB5, (byte) 0xB6, (byte) 0xB7, (byte) 0xB8, + (byte) 0xB9, (byte) 0xBA, (byte) 0xBB, (byte) 0xBC, + (byte) 0xBD, (byte) 0xBE, (byte) 0xBF, (byte) 0xC0, + (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, + (byte) 0xC5, (byte) 0xC6, (byte) 0xC7, (byte) 0xC8, + (byte) 0xC9, (byte) 0xCA, (byte) 0xCB, (byte) 0xCC, + (byte) 0xCD, (byte) 0xCE, (byte) 0xCF, (byte) 0xD0, + (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, + (byte) 0xD5, (byte) 0xD6, (byte) 0xD7, (byte) 0xD8, + (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, + (byte) 0xDD, (byte) 0xDE, (byte) 0xDF, (byte) 0xE0, + (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, + (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, (byte) 0xE8, + (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, + (byte) 0xED, (byte) 0xEE, (byte) 0xEF, (byte) 0xF0, + (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, + (byte) 0xF5, (byte) 0xF6, (byte) 0xF7, (byte) 0xF8, + (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, + (byte) 0xFD, (byte) 0xFE, (byte) 0xFF }; + + doSimpleEncodingTest("Cp1252",b); + } + + private static final void assertEquals(byte[] expected, ByteBuffer actual) { + + assertEquals(expected.length, actual.limit()); + + for (int i = 0; i < expected.length; ++i) { + + byte a = actual.get(); + assertEquals(expected[i], a); + } + + } + + private void doSimpleEncodingTest(String name, byte[] testBytes) + throws IOException { + + ZipEncoding enc = ZipEncodingHelper.getZipEncoding(name); + + if (testBytes == null) { + + testBytes = new byte[256]; + for (int i = 0; i < 256; ++i) { + testBytes[i] = (byte) i; + } + } + + String decoded = enc.decode(testBytes); + + assertEquals(true, enc.canEncode(decoded)); + + ByteBuffer encoded = enc.encode(decoded); + + assertEquals(testBytes, encoded); + + assertEquals(false, enc.canEncode(UNENC_STRING)); + assertEquals("%U2016".getBytes("US-ASCII"), 
enc.encode(UNENC_STRING)); + assertEquals(false, enc.canEncode(BAD_STRING)); + assertEquals(BAD_STRING_ENC.getBytes("US-ASCII"), + enc.encode(BAD_STRING)); + } + +}