You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or number, can include dashes ('-') and can be up to 35 characters long.

charset_test.go 7.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191
  1. // Copyright 2019 The Gitea Authors. All rights reserved.
  2. // Use of this source code is governed by a MIT-style
  3. // license that can be found in the LICENSE file.
  4. package charset
  5. import (
  6. "testing"
  7. "code.gitea.io/gitea/modules/setting"
  8. "github.com/stretchr/testify/assert"
  9. )
  10. func TestRemoveBOMIfPresent(t *testing.T) {
  11. res := RemoveBOMIfPresent([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  12. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  13. res = RemoveBOMIfPresent([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  14. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  15. }
  16. func TestToUTF8WithErr(t *testing.T) {
  17. var res string
  18. var err error
  19. res, err = ToUTF8WithErr([]byte{0x41, 0x42, 0x43})
  20. assert.Equal(t, "ABC", res)
  21. assert.NoError(t, err)
  22. res, err = ToUTF8WithErr([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  23. assert.Equal(t, "áéíóú", res)
  24. assert.NoError(t, err)
  25. res, err = ToUTF8WithErr([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  26. assert.Equal(t, "áéíóú", res)
  27. assert.NoError(t, err)
  28. // This test FAILS
  29. res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73})
  30. assert.Equal(t, "Hola, así cómo ños", res)
  31. assert.NoError(t, err)
  32. res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73})
  33. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  34. assert.Regexp(t, "^Hola, así cómo", res)
  35. assert.NoError(t, err)
  36. res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73})
  37. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  38. assert.Regexp(t, "^Hola, así cómo", res)
  39. assert.NoError(t, err)
  40. // Japanese (Shift-JIS)
  41. res, err = ToUTF8WithErr([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42})
  42. assert.Equal(t, "日属秘ぞしちゅ。", res)
  43. assert.NoError(t, err)
  44. res, err = ToUTF8WithErr([]byte{0x00, 0x00, 0x00, 0x00})
  45. assert.Equal(t, "\x00\x00\x00\x00", res)
  46. assert.NoError(t, err)
  47. }
  48. func TestToUTF8WithFallback(t *testing.T) {
  49. res := ToUTF8WithFallback([]byte{0x41, 0x42, 0x43})
  50. assert.Equal(t, []byte("ABC"), res)
  51. res = ToUTF8WithFallback([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  52. assert.Equal(t, []byte("áéíóú"), res)
  53. res = ToUTF8WithFallback([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  54. assert.Equal(t, []byte("áéíóú"), res)
  55. res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73})
  56. assert.Equal(t, []byte("Hola, así cómo ños"), res)
  57. minmatch := []byte("Hola, así cómo ")
  58. res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73})
  59. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  60. assert.Equal(t, minmatch, res[0:len(minmatch)])
  61. res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73})
  62. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  63. assert.Equal(t, minmatch, res[0:len(minmatch)])
  64. // Japanese (Shift-JIS)
  65. res = ToUTF8WithFallback([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42})
  66. assert.Equal(t, []byte("日属秘ぞしちゅ。"), res)
  67. res = ToUTF8WithFallback([]byte{0x00, 0x00, 0x00, 0x00})
  68. assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res)
  69. }
  70. func TestToUTF8(t *testing.T) {
  71. res := ToUTF8("ABC")
  72. assert.Equal(t, "ABC", res)
  73. res = ToUTF8("áéíóú")
  74. assert.Equal(t, "áéíóú", res)
  75. // With utf-8 BOM
  76. res = ToUTF8("\ufeffáéíóú")
  77. assert.Equal(t, "áéíóú", res)
  78. res = ToUTF8("Hola, así cómo ños")
  79. assert.Equal(t, "Hola, así cómo ños", res)
  80. res = ToUTF8("Hola, así cómo \x07ños")
  81. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  82. assert.Regexp(t, "^Hola, así cómo", res)
  83. // This test FAILS
  84. // res = ToUTF8("Hola, así cómo \x81ños")
  85. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  86. // assert.Regexp(t, "^Hola, así cómo", res)
  87. // Japanese (Shift-JIS)
  88. res = ToUTF8("\x93\xFA\x91\xAE\x94\xE9\x82\xBC\x82\xB5\x82\xBF\x82\xE3\x81\x42")
  89. assert.Equal(t, "日属秘ぞしちゅ。", res)
  90. res = ToUTF8("\x00\x00\x00\x00")
  91. assert.Equal(t, "\x00\x00\x00\x00", res)
  92. }
  93. func TestToUTF8DropErrors(t *testing.T) {
  94. res := ToUTF8DropErrors([]byte{0x41, 0x42, 0x43})
  95. assert.Equal(t, []byte("ABC"), res)
  96. res = ToUTF8DropErrors([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  97. assert.Equal(t, []byte("áéíóú"), res)
  98. res = ToUTF8DropErrors([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  99. assert.Equal(t, []byte("áéíóú"), res)
  100. res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73})
  101. assert.Equal(t, []byte("Hola, así cómo ños"), res)
  102. minmatch := []byte("Hola, así cómo ")
  103. res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73})
  104. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  105. assert.Equal(t, minmatch, res[0:len(minmatch)])
  106. res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73})
  107. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  108. assert.Equal(t, minmatch, res[0:len(minmatch)])
  109. // Japanese (Shift-JIS)
  110. res = ToUTF8DropErrors([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42})
  111. assert.Equal(t, []byte("日属秘ぞしちゅ。"), res)
  112. res = ToUTF8DropErrors([]byte{0x00, 0x00, 0x00, 0x00})
  113. assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res)
  114. }
  115. func TestDetectEncoding(t *testing.T) {
  116. testSuccess := func(b []byte, expected string) {
  117. encoding, err := DetectEncoding(b)
  118. assert.NoError(t, err)
  119. assert.Equal(t, expected, encoding)
  120. }
  121. // utf-8
  122. b := []byte("just some ascii")
  123. testSuccess(b, "UTF-8")
  124. // utf-8-sig: "hey" (with BOM)
  125. b = []byte{0xef, 0xbb, 0xbf, 0x68, 0x65, 0x79}
  126. testSuccess(b, "UTF-8")
  127. // utf-16: "hey<accented G>"
  128. b = []byte{0xff, 0xfe, 0x68, 0x00, 0x65, 0x00, 0x79, 0x00, 0xf4, 0x01}
  129. testSuccess(b, "UTF-16LE")
  130. // iso-8859-1: d<accented e>cor<newline>
  131. b = []byte{0x44, 0xe9, 0x63, 0x6f, 0x72, 0x0a}
  132. encoding, err := DetectEncoding(b)
  133. assert.NoError(t, err)
  134. // due to a race condition in `chardet` library, it could either detect
  135. // "ISO-8859-1" or "IS0-8859-2" here. Technically either is correct, so
  136. // we accept either.
  137. assert.Contains(t, encoding, "ISO-8859")
  138. setting.Repository.AnsiCharset = "placeholder"
  139. testSuccess(b, "placeholder")
  140. // invalid bytes
  141. b = []byte{0xfa}
  142. _, err = DetectEncoding(b)
  143. assert.Error(t, err)
  144. }