* Replace linkRegex with xurls library. Rather than maintaining a complicated regex to match URLs for autolinking, gitea can use this existing Go library, which takes care of the matching with very little code change to gitea itself. After spending a while trying to find the perfect regex for all cases, this library still works better, as it is more flexible than a single regex ever will be. This will also fix the following issues: #5844, #3095, #3381. This passes all our current tests, and I've added new ones mentioned in those issues as well. * Use xurls.StrictMatchingScheme instead of xurls.Strict. This is much faster, and we only care about https? links to preserve existing behavior. (tags/v1.21.12.1)
| @@ -725,6 +725,14 @@ | |||
| pruneopts = "NUT" | |||
| revision = "02ccfbfaf0cc627aa3aec8ef7ed5cfeec5b43f63" | |||
| [[projects]] | |||
| digest = "1:63953ffb90bbc880c612d576fcfd973a5904277d25ec9e2d8d5719bf67969662" | |||
| name = "github.com/mvdan/xurls" | |||
| packages = ["."] | |||
| pruneopts = "NUT" | |||
| revision = "e52e821cbfe8fe163ff6f8628ab5869b11fc05af" | |||
| version = "v2.0.0" | |||
| [[projects]] | |||
| digest = "1:2be1d891535ce3d6d2a3db9087f07415e909744e9eff1a30f8f0b2519df60ae6" | |||
| name = "github.com/nfnt/resize" | |||
| @@ -1293,6 +1301,7 @@ | |||
| "github.com/mcuadros/go-version", | |||
| "github.com/microcosm-cc/bluemonday", | |||
| "github.com/msteinert/pam", | |||
| "github.com/mvdan/xurls", | |||
| "github.com/nfnt/resize", | |||
| "github.com/pquerna/otp", | |||
| "github.com/pquerna/otp/totp", | |||
| @@ -113,3 +113,7 @@ ignored = ["google.golang.org/appengine*"] | |||
| [[constraint]] | |||
| name = "github.com/prometheus/client_golang" | |||
| version = "0.9.0" | |||
| [[constraint]] | |||
| name = "github.com/mvdan/xurls" | |||
| version = "2.0.0" | |||
| @@ -17,6 +17,7 @@ import ( | |||
| "code.gitea.io/gitea/modules/util" | |||
| "github.com/Unknwon/com" | |||
| "github.com/mvdan/xurls" | |||
| "golang.org/x/net/html" | |||
| "golang.org/x/net/html/atom" | |||
| ) | |||
| @@ -64,9 +65,7 @@ var ( | |||
| // https://html.spec.whatwg.org/multipage/input.html#e-mail-state-(type%3Demail) | |||
| emailRegex = regexp.MustCompile("[a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*") | |||
| // matches http/https links. used for autolinking those. partly modified from | |||
| // the original present in autolink.js | |||
| linkRegex = regexp.MustCompile(`(?:(?:http|https):\/\/(?:[\-;:&=\+\$,\w]+@)?[A-Za-z0-9\.\-]+(?:\.|[\-;:&=\+\$,\w]+@)[A-Za-z0-9\.\-]+)(?:(?:\/[\+~%\/\.\w\-]*)?\??(?:[\-\+:=&;%@\.\w]*)#?(?:[\.\!\/\\\w]*))?`) | |||
| linkRegex, _ = xurls.StrictMatchingScheme("https?://") | |||
| ) | |||
| // regexp for full links to issues/pulls | |||
| @@ -104,6 +104,15 @@ func TestRender_links(t *testing.T) { | |||
| test( | |||
| "http://142.42.1.1/", | |||
| `<p><a href="http://142.42.1.1/" rel="nofollow">http://142.42.1.1/</a></p>`) | |||
| test( | |||
| "https://github.com/go-gitea/gitea/?p=aaa/bbb.html#ccc-ddd", | |||
| `<p><a href="https://github.com/go-gitea/gitea/?p=aaa/bbb.html#ccc-ddd" rel="nofollow">https://github.com/go-gitea/gitea/?p=aaa/bbb.html#ccc-ddd</a></p>`) | |||
| test( | |||
| "https://en.wikipedia.org/wiki/URL_(disambiguation)", | |||
| `<p><a href="https://en.wikipedia.org/wiki/URL_(disambiguation)" rel="nofollow">https://en.wikipedia.org/wiki/URL_(disambiguation)</a></p>`) | |||
| test( | |||
| "https://foo_bar.example.com/", | |||
| `<p><a href="https://foo_bar.example.com/" rel="nofollow">https://foo_bar.example.com/</a></p>`) | |||
| // Test that should *not* be turned into URL | |||
| test( | |||
| @@ -0,0 +1,27 @@ | |||
| Copyright (c) 2015, Daniel Martí. All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| * Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| * Redistributions in binary form must reproduce the above | |||
| copyright notice, this list of conditions and the following disclaimer | |||
| in the documentation and/or other materials provided with the | |||
| distribution. | |||
| * Neither the name of the copyright holder nor the names of its | |||
| contributors may be used to endorse or promote products derived from | |||
| this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |||
| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |||
| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |||
| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |||
| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |||
| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |||
| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |||
| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |||
| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |||
| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |||
| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| @@ -0,0 +1,299 @@ | |||
| // Generated by schemesgen | |||
| package xurls | |||
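| // Schemes is a list of all IANA assigned schemes. It is nearly, but not strictly, sorted (e.g. ms-project precedes ms-powerpoint), so do not binary-search it. | |||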
| // | |||
| // Source: | |||
| // https://www.iana.org/assignments/uri-schemes/uri-schemes-1.csv | |||
| var Schemes = []string{ | |||
| `aaa`, | |||
| `aaas`, | |||
| `about`, | |||
| `acap`, | |||
| `acct`, | |||
| `acr`, | |||
| `adiumxtra`, | |||
| `afp`, | |||
| `afs`, | |||
| `aim`, | |||
| `appdata`, | |||
| `apt`, | |||
| `attachment`, | |||
| `aw`, | |||
| `barion`, | |||
| `beshare`, | |||
| `bitcoin`, | |||
| `bitcoincash`, | |||
| `blob`, | |||
| `bolo`, | |||
| `browserext`, | |||
| `callto`, | |||
| `cap`, | |||
| `chrome`, | |||
| `chrome-extension`, | |||
| `cid`, | |||
| `coap`, | |||
| `coap+tcp`, | |||
| `coap+ws`, | |||
| `coaps`, | |||
| `coaps+tcp`, | |||
| `coaps+ws`, | |||
| `com-eventbrite-attendee`, | |||
| `content`, | |||
| `conti`, | |||
| `crid`, | |||
| `cvs`, | |||
| `data`, | |||
| `dav`, | |||
| `diaspora`, | |||
| `dict`, | |||
| `did`, | |||
| `dis`, | |||
| `dlna-playcontainer`, | |||
| `dlna-playsingle`, | |||
| `dns`, | |||
| `dntp`, | |||
| `dtn`, | |||
| `dvb`, | |||
| `ed2k`, | |||
| `elsi`, | |||
| `example`, | |||
| `facetime`, | |||
| `fax`, | |||
| `feed`, | |||
| `feedready`, | |||
| `file`, | |||
| `filesystem`, | |||
| `finger`, | |||
| `fish`, | |||
| `ftp`, | |||
| `geo`, | |||
| `gg`, | |||
| `git`, | |||
| `gizmoproject`, | |||
| `go`, | |||
| `gopher`, | |||
| `graph`, | |||
| `gtalk`, | |||
| `h323`, | |||
| `ham`, | |||
| `hcap`, | |||
| `hcp`, | |||
| `http`, | |||
| `https`, | |||
| `hxxp`, | |||
| `hxxps`, | |||
| `hydrazone`, | |||
| `iax`, | |||
| `icap`, | |||
| `icon`, | |||
| `im`, | |||
| `imap`, | |||
| `info`, | |||
| `iotdisco`, | |||
| `ipn`, | |||
| `ipp`, | |||
| `ipps`, | |||
| `irc`, | |||
| `irc6`, | |||
| `ircs`, | |||
| `iris`, | |||
| `iris.beep`, | |||
| `iris.lwz`, | |||
| `iris.xpc`, | |||
| `iris.xpcs`, | |||
| `isostore`, | |||
| `itms`, | |||
| `jabber`, | |||
| `jar`, | |||
| `jms`, | |||
| `keyparc`, | |||
| `lastfm`, | |||
| `ldap`, | |||
| `ldaps`, | |||
| `lvlt`, | |||
| `magnet`, | |||
| `mailserver`, | |||
| `mailto`, | |||
| `maps`, | |||
| `market`, | |||
| `message`, | |||
| `microsoft.windows.camera`, | |||
| `microsoft.windows.camera.multipicker`, | |||
| `microsoft.windows.camera.picker`, | |||
| `mid`, | |||
| `mms`, | |||
| `modem`, | |||
| `mongodb`, | |||
| `moz`, | |||
| `ms-access`, | |||
| `ms-browser-extension`, | |||
| `ms-drive-to`, | |||
| `ms-enrollment`, | |||
| `ms-excel`, | |||
| `ms-gamebarservices`, | |||
| `ms-gamingoverlay`, | |||
| `ms-getoffice`, | |||
| `ms-help`, | |||
| `ms-infopath`, | |||
| `ms-inputapp`, | |||
| `ms-lockscreencomponent-config`, | |||
| `ms-media-stream-id`, | |||
| `ms-mixedrealitycapture`, | |||
| `ms-officeapp`, | |||
| `ms-people`, | |||
| `ms-project`, | |||
| `ms-powerpoint`, | |||
| `ms-publisher`, | |||
| `ms-restoretabcompanion`, | |||
| `ms-screenclip`, | |||
| `ms-screensketch`, | |||
| `ms-search`, | |||
| `ms-search-repair`, | |||
| `ms-secondary-screen-controller`, | |||
| `ms-secondary-screen-setup`, | |||
| `ms-settings`, | |||
| `ms-settings-airplanemode`, | |||
| `ms-settings-bluetooth`, | |||
| `ms-settings-camera`, | |||
| `ms-settings-cellular`, | |||
| `ms-settings-cloudstorage`, | |||
| `ms-settings-connectabledevices`, | |||
| `ms-settings-displays-topology`, | |||
| `ms-settings-emailandaccounts`, | |||
| `ms-settings-language`, | |||
| `ms-settings-location`, | |||
| `ms-settings-lock`, | |||
| `ms-settings-nfctransactions`, | |||
| `ms-settings-notifications`, | |||
| `ms-settings-power`, | |||
| `ms-settings-privacy`, | |||
| `ms-settings-proximity`, | |||
| `ms-settings-screenrotation`, | |||
| `ms-settings-wifi`, | |||
| `ms-settings-workplace`, | |||
| `ms-spd`, | |||
| `ms-sttoverlay`, | |||
| `ms-transit-to`, | |||
| `ms-useractivityset`, | |||
| `ms-virtualtouchpad`, | |||
| `ms-visio`, | |||
| `ms-walk-to`, | |||
| `ms-whiteboard`, | |||
| `ms-whiteboard-cmd`, | |||
| `ms-word`, | |||
| `msnim`, | |||
| `msrp`, | |||
| `msrps`, | |||
| `mtqp`, | |||
| `mumble`, | |||
| `mupdate`, | |||
| `mvn`, | |||
| `news`, | |||
| `nfs`, | |||
| `ni`, | |||
| `nih`, | |||
| `nntp`, | |||
| `notes`, | |||
| `ocf`, | |||
| `oid`, | |||
| `onenote`, | |||
| `onenote-cmd`, | |||
| `opaquelocktoken`, | |||
| `openpgp4fpr`, | |||
| `pack`, | |||
| `palm`, | |||
| `paparazzi`, | |||
| `pkcs11`, | |||
| `platform`, | |||
| `pop`, | |||
| `pres`, | |||
| `prospero`, | |||
| `proxy`, | |||
| `pwid`, | |||
| `psyc`, | |||
| `qb`, | |||
| `query`, | |||
| `redis`, | |||
| `rediss`, | |||
| `reload`, | |||
| `res`, | |||
| `resource`, | |||
| `rmi`, | |||
| `rsync`, | |||
| `rtmfp`, | |||
| `rtmp`, | |||
| `rtsp`, | |||
| `rtsps`, | |||
| `rtspu`, | |||
| `secondlife`, | |||
| `service`, | |||
| `session`, | |||
| `sftp`, | |||
| `sgn`, | |||
| `shttp`, | |||
| `sieve`, | |||
| `simpleledger`, | |||
| `sip`, | |||
| `sips`, | |||
| `skype`, | |||
| `smb`, | |||
| `sms`, | |||
| `smtp`, | |||
| `snews`, | |||
| `snmp`, | |||
| `soap.beep`, | |||
| `soap.beeps`, | |||
| `soldat`, | |||
| `spiffe`, | |||
| `spotify`, | |||
| `ssh`, | |||
| `steam`, | |||
| `stun`, | |||
| `stuns`, | |||
| `submit`, | |||
| `svn`, | |||
| `tag`, | |||
| `teamspeak`, | |||
| `tel`, | |||
| `teliaeid`, | |||
| `telnet`, | |||
| `tftp`, | |||
| `things`, | |||
| `thismessage`, | |||
| `tip`, | |||
| `tn3270`, | |||
| `tool`, | |||
| `turn`, | |||
| `turns`, | |||
| `tv`, | |||
| `udp`, | |||
| `unreal`, | |||
| `urn`, | |||
| `ut2004`, | |||
| `v-event`, | |||
| `vemmi`, | |||
| `ventrilo`, | |||
| `videotex`, | |||
| `vnc`, | |||
| `view-source`, | |||
| `wais`, | |||
| `webcal`, | |||
| `wpid`, | |||
| `ws`, | |||
| `wss`, | |||
| `wtai`, | |||
| `wyciwyg`, | |||
| `xcon`, | |||
| `xcon-userid`, | |||
| `xfire`, | |||
| `xmlrpc.beep`, | |||
| `xmlrpc.beeps`, | |||
| `xmpp`, | |||
| `xri`, | |||
| `ymsgr`, | |||
| `z39.50`, | |||
| `z39.50r`, | |||
| `z39.50s`, | |||
| } | |||
| @@ -0,0 +1,24 @@ | |||
| // Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc> | |||
| // See LICENSE for licensing information | |||
| package xurls | |||
| // PseudoTLDs is a sorted list of some widely used unofficial TLDs. | |||
| // | |||
| // Sources: | |||
| // * https://en.wikipedia.org/wiki/Pseudo-top-level_domain | |||
| // * https://en.wikipedia.org/wiki/Category:Pseudo-top-level_domains | |||
| // * https://tools.ietf.org/html/draft-grothoff-iesg-special-use-p2p-names-00 | |||
| // * https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml | |||
| var PseudoTLDs = []string{ | |||
| `bit`, // Namecoin | |||
| `example`, // Example domain | |||
| `exit`, // Tor exit node | |||
| `gnu`, // GNS by public key | |||
| `i2p`, // I2P network | |||
| `invalid`, // Invalid domain | |||
| `local`, // Local network | |||
| `localhost`, // Local network | |||
| `test`, // Test domain | |||
| `zkey`, // GNS domain name | |||
| } | |||
| @@ -0,0 +1,107 @@ | |||
| // Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc> | |||
| // See LICENSE for licensing information | |||
| // Package xurls extracts urls from plain text using regular expressions. | |||
| package xurls | |||
| import ( | |||
| "bytes" | |||
| "regexp" | |||
| ) | |||
| //go:generate go run generate/tldsgen/main.go | |||
| //go:generate go run generate/schemesgen/main.go | |||
// Fragments of regular-expression source. They are only ever concatenated
// into larger patterns (here and in the functions of this file); none of them
// is compiled on its own.
const (
	// Unicode classes that make up the characters of an IRI label. These are
	// class bodies and are always spliced inside a [...] by their users.
	letter  = `\p{L}`
	mark    = `\p{M}`
	number  = `\p{N}`
	iriChar = letter + mark + number

	// endChar is the class body of characters that may be the last character
	// of a matched path: IRI characters, a fixed set of ASCII symbols,
	// currency symbols and "other" symbols.
	currency  = `\p{Sc}`
	otherSymb = `\p{So}`
	endChar   = iriChar + `/\-+_&~*%=#` + currency + otherSymb

	// midChar extends endChar with characters that may appear inside a path
	// but not end it. The `|` is a literal pipe, not an alternation: midChar
	// is only ever used inside a [...] character class.
	otherPunc = `\p{Po}`
	midChar   = endChar + `|` + otherPunc

	// Balanced bracket groups: an opening bracket, midChar runs, optionally
	// any number of inner groups of the same bracket kind (one nesting level
	// only), and the matching closing bracket.
	wellParen = `\([` + midChar + `]*(\([` + midChar + `]*\)[` + midChar + `]*)*\)`
	wellBrack = `\[[` + midChar + `]*(\[[` + midChar + `]*\][` + midChar + `]*)*\]`
	wellBrace = `\{[` + midChar + `]*(\{[` + midChar + `]*\}[` + midChar + `]*)*\}`
	wellAll   = wellParen + `|` + wellBrack + `|` + wellBrace

	// pathCont is what follows the scheme: one or more runs of midChar, each
	// run closed by at least one balanced bracket group or endChar, so a
	// match never ends on a bare midChar-only character.
	pathCont = `([` + midChar + `]*(` + wellAll + `|[` + endChar + `])+)+`

	// iri is a single host label: it starts and ends with an iriChar and may
	// contain hyphens in between. domain is one or more labels, each followed
	// by a dot (the TLD is appended by relaxedExp).
	iri    = `[` + iriChar + `]([` + iriChar + `\-]*[` + iriChar + `])?`
	domain = `(` + iri + `\.)+`

	// octet matches a decimal number 0-255 without leading zeros; ipv4Addr is
	// four of them separated by dots and delimited by word boundaries.
	octet    = `(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])`
	ipv4Addr = `\b` + octet + `\.` + octet + `\.` + octet + `\.` + octet + `\b`
	// ipv6Addr matches colon-separated groups of 1-4 hex digits, including
	// forms with "::" and forms ending in a dotted-quad IPv4 tail.
	ipv6Addr = `([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:[0-9a-fA-F]{0,4}|:[0-9a-fA-F]{1,4})?|(:[0-9a-fA-F]{1,4}){0,2})|(:[0-9a-fA-F]{1,4}){0,3})|(:[0-9a-fA-F]{1,4}){0,4})|:(:[0-9a-fA-F]{1,4}){0,5})((:[0-9a-fA-F]{1,4}){2}|:(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])(\.(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])){3})|(([0-9a-fA-F]{1,4}:){1,6}|:):[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){7}:`
	ipAddr   = `(` + ipv4Addr + `|` + ipv6Addr + `)`
	// port is optional: a colon followed by zero or more digits (so a bare
	// ":" is accepted as well).
	port = `(:[0-9]*)?`
)
| // AnyScheme can be passed to StrictMatchingScheme to match any possibly valid | |||
| // scheme, and not just the known ones. | |||
| var AnyScheme = `([a-zA-Z][a-zA-Z.\-+]*://|` + anyOf(SchemesNoAuthority...) + `:)` | |||
| // SchemesNoAuthority is a sorted list of some well-known url schemes that are | |||
| // followed by ":" instead of "://". | |||
| var SchemesNoAuthority = []string{ | |||
| `bitcoin`, // Bitcoin | |||
| `file`, // Files | |||
| `magnet`, // Torrent magnets | |||
| `mailto`, // Mail | |||
| `sms`, // SMS | |||
| `tel`, // Telephone | |||
| `xmpp`, // XMPP | |||
| } | |||
// anyOf returns the source of a parenthesised regexp alternation that matches
// any one of strs literally. Each element is escaped with regexp.QuoteMeta, so
// metacharacters in the inputs (e.g. the "+" in "coap+tcp") lose their special
// meaning. With no arguments the result is "()".
func anyOf(strs ...string) string {
	escaped := make([][]byte, len(strs))
	for idx := range strs {
		escaped[idx] = []byte(regexp.QuoteMeta(strs[idx]))
	}
	return "(" + string(bytes.Join(escaped, []byte("|"))) + ")"
}
| func strictExp() string { | |||
| schemes := `(` + anyOf(Schemes...) + `://|` + anyOf(SchemesNoAuthority...) + `:)` | |||
| return `(?i)` + schemes + `(?-i)` + pathCont | |||
| } | |||
| func relaxedExp() string { | |||
| site := domain + `(?i)` + anyOf(append(TLDs, PseudoTLDs...)...) + `(?-i)` | |||
| hostName := `(` + site + `|` + ipAddr + `)` | |||
| webURL := hostName + port + `(/|/` + pathCont + `?|\b|$)` | |||
| return strictExp() + `|` + webURL | |||
| } | |||
| // Strict produces a regexp that matches any URL with a scheme in either the | |||
| // Schemes or SchemesNoAuthority lists. | |||
| func Strict() *regexp.Regexp { | |||
| re := regexp.MustCompile(strictExp()) | |||
| re.Longest() | |||
| return re | |||
| } | |||
| // Relaxed produces a regexp that matches any URL matched by Strict, plus any | |||
| // URL with no scheme. | |||
| func Relaxed() *regexp.Regexp { | |||
| re := regexp.MustCompile(relaxedExp()) | |||
| re.Longest() | |||
| return re | |||
| } | |||
| // StrictMatchingScheme produces a regexp similar to Strict, but requiring that | |||
| // the scheme match the given regular expression. See AnyScheme too. | |||
| func StrictMatchingScheme(exp string) (*regexp.Regexp, error) { | |||
| strictMatching := `(?i)(` + exp + `)(?-i)` + pathCont | |||
| re, err := regexp.Compile(strictMatching) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| re.Longest() | |||
| return re, nil | |||
| } | |||