You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

charset_test.go 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251
  1. // Copyright 2019 The Gitea Authors. All rights reserved.
  2. // Use of this source code is governed by a MIT-style
  3. // license that can be found in the LICENSE file.
  4. package charset
  5. import (
  6. "testing"
  7. "code.gitea.io/gitea/modules/setting"
  8. "github.com/stretchr/testify/assert"
  9. )
  10. func TestRemoveBOMIfPresent(t *testing.T) {
  11. res := RemoveBOMIfPresent([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  12. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  13. res = RemoveBOMIfPresent([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  14. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  15. }
  16. func TestToUTF8WithErr(t *testing.T) {
  17. var res string
  18. var err error
  19. // Note: golang compiler seems so behave differently depending on the current
  20. // locale, so some conversions might behave differently. For that reason, we don't
  21. // depend on particular conversions but in expected behaviors.
  22. res, err = ToUTF8WithErr([]byte{0x41, 0x42, 0x43})
  23. assert.NoError(t, err)
  24. assert.Equal(t, "ABC", res)
  25. // "áéíóú"
  26. res, err = ToUTF8WithErr([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  27. assert.NoError(t, err)
  28. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))
  29. // "áéíóú"
  30. res, err = ToUTF8WithErr([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3,
  31. 0xc3, 0xba})
  32. assert.NoError(t, err)
  33. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))
  34. res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
  35. 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e})
  36. assert.NoError(t, err)
  37. stringMustStartWith(t, "Hola,", res)
  38. stringMustEndWith(t, "AAA.", res)
  39. res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
  40. 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e})
  41. assert.NoError(t, err)
  42. stringMustStartWith(t, "Hola,", res)
  43. stringMustEndWith(t, "AAA.", res)
  44. res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
  45. 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e})
  46. assert.NoError(t, err)
  47. stringMustStartWith(t, "Hola,", res)
  48. stringMustEndWith(t, "AAA.", res)
  49. // Japanese (Shift-JIS)
  50. // 日属秘ぞしちゅ。
  51. res, err = ToUTF8WithErr([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82,
  52. 0xBF, 0x82, 0xE3, 0x81, 0x42})
  53. assert.NoError(t, err)
  54. assert.Equal(t, []byte{0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
  55. 0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82},
  56. []byte(res))
  57. res, err = ToUTF8WithErr([]byte{0x00, 0x00, 0x00, 0x00})
  58. assert.NoError(t, err)
  59. assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, []byte(res))
  60. }
  61. func TestToUTF8WithFallback(t *testing.T) {
  62. // "ABC"
  63. res := ToUTF8WithFallback([]byte{0x41, 0x42, 0x43})
  64. assert.Equal(t, []byte{0x41, 0x42, 0x43}, res)
  65. // "áéíóú"
  66. res = ToUTF8WithFallback([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  67. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  68. // UTF8 BOM + "áéíóú"
  69. res = ToUTF8WithFallback([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  70. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  71. // "Hola, así cómo ños"
  72. res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
  73. 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73})
  74. assert.Equal(t, []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63,
  75. 0xC3, 0xB3, 0x6D, 0x6F, 0x20, 0xC3, 0xB1, 0x6F, 0x73}, res)
  76. // "Hola, así cómo "
  77. minmatch := []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20}
  78. res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73})
  79. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  80. assert.Equal(t, minmatch, res[0:len(minmatch)])
  81. res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73})
  82. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  83. assert.Equal(t, minmatch, res[0:len(minmatch)])
  84. // Japanese (Shift-JIS)
  85. // "日属秘ぞしちゅ。"
  86. res = ToUTF8WithFallback([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42})
  87. assert.Equal(t, []byte{0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
  88. 0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82}, res)
  89. res = ToUTF8WithFallback([]byte{0x00, 0x00, 0x00, 0x00})
  90. assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res)
  91. }
  92. func TestToUTF8(t *testing.T) {
  93. // Note: golang compiler seems so behave differently depending on the current
  94. // locale, so some conversions might behave differently. For that reason, we don't
  95. // depend on particular conversions but in expected behaviors.
  96. res := ToUTF8(string([]byte{0x41, 0x42, 0x43}))
  97. assert.Equal(t, "ABC", res)
  98. // "áéíóú"
  99. res = ToUTF8(string([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}))
  100. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))
  101. // BOM + "áéíóú"
  102. res = ToUTF8(string([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3,
  103. 0xc3, 0xba}))
  104. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))
  105. // Latin1
  106. // Hola, así cómo ños
  107. res = ToUTF8(string([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
  108. 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73}))
  109. assert.Equal(t, []byte{0x48, 0x6f, 0x6c, 0x61, 0x2c, 0x20, 0x61, 0x73, 0xc3, 0xad, 0x20, 0x63,
  110. 0xc3, 0xb3, 0x6d, 0x6f, 0x20, 0xc3, 0xb1, 0x6f, 0x73}, []byte(res))
  111. // Latin1
  112. // Hola, así cómo \x07ños
  113. res = ToUTF8(string([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
  114. 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73}))
  115. // Hola,
  116. bytesMustStartWith(t, []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C}, []byte(res))
  117. // This test FAILS
  118. // res = ToUTF8("Hola, así cómo \x81ños")
  119. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  120. // assert.Regexp(t, "^Hola, así cómo", res)
  121. // Japanese (Shift-JIS)
  122. // 日属秘ぞしちゅ。
  123. res = ToUTF8(string([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82,
  124. 0xBF, 0x82, 0xE3, 0x81, 0x42}))
  125. assert.Equal(t, []byte{0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
  126. 0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82},
  127. []byte(res))
  128. res = ToUTF8("\x00\x00\x00\x00")
  129. assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, []byte(res))
  130. }
  131. func TestToUTF8DropErrors(t *testing.T) {
  132. // "ABC"
  133. res := ToUTF8DropErrors([]byte{0x41, 0x42, 0x43})
  134. assert.Equal(t, []byte{0x41, 0x42, 0x43}, res)
  135. // "áéíóú"
  136. res = ToUTF8DropErrors([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  137. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  138. // UTF8 BOM + "áéíóú"
  139. res = ToUTF8DropErrors([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  140. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  141. // "Hola, así cómo ños"
  142. res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73})
  143. assert.Equal(t, []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20, 0xC3, 0xB1, 0x6F, 0x73}, res)
  144. // "Hola, así cómo "
  145. minmatch := []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20}
  146. res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73})
  147. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  148. assert.Equal(t, minmatch, res[0:len(minmatch)])
  149. res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73})
  150. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  151. assert.Equal(t, minmatch, res[0:len(minmatch)])
  152. // Japanese (Shift-JIS)
  153. // "日属秘ぞしちゅ。"
  154. res = ToUTF8DropErrors([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42})
  155. assert.Equal(t, []byte{0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
  156. 0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82}, res)
  157. res = ToUTF8DropErrors([]byte{0x00, 0x00, 0x00, 0x00})
  158. assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res)
  159. }
  160. func TestDetectEncoding(t *testing.T) {
  161. testSuccess := func(b []byte, expected string) {
  162. encoding, err := DetectEncoding(b)
  163. assert.NoError(t, err)
  164. assert.Equal(t, expected, encoding)
  165. }
  166. // utf-8
  167. b := []byte("just some ascii")
  168. testSuccess(b, "UTF-8")
  169. // utf-8-sig: "hey" (with BOM)
  170. b = []byte{0xef, 0xbb, 0xbf, 0x68, 0x65, 0x79}
  171. testSuccess(b, "UTF-8")
  172. // utf-16: "hey<accented G>"
  173. b = []byte{0xff, 0xfe, 0x68, 0x00, 0x65, 0x00, 0x79, 0x00, 0xf4, 0x01}
  174. testSuccess(b, "UTF-16LE")
  175. // iso-8859-1: d<accented e>cor<newline>
  176. b = []byte{0x44, 0xe9, 0x63, 0x6f, 0x72, 0x0a}
  177. encoding, err := DetectEncoding(b)
  178. assert.NoError(t, err)
  179. // due to a race condition in `chardet` library, it could either detect
  180. // "ISO-8859-1" or "IS0-8859-2" here. Technically either is correct, so
  181. // we accept either.
  182. assert.Contains(t, encoding, "ISO-8859")
  183. setting.Repository.AnsiCharset = "placeholder"
  184. testSuccess(b, "placeholder")
  185. // invalid bytes
  186. b = []byte{0xfa}
  187. _, err = DetectEncoding(b)
  188. assert.Error(t, err)
  189. }
  190. func stringMustStartWith(t *testing.T, expected string, value string) {
  191. assert.Equal(t, expected, string(value[:len(expected)]))
  192. }
  193. func stringMustEndWith(t *testing.T, expected string, value string) {
  194. assert.Equal(t, expected, string(value[len(value)-len(expected):]))
  195. }
  196. func bytesMustStartWith(t *testing.T, expected []byte, value []byte) {
  197. assert.Equal(t, expected, value[:len(expected)])
  198. }