123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258 |
- // Copyright 2010 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
-
- package html
-
- import (
- "bytes"
- "strings"
- "unicode/utf8"
- )
-
- // These replacements permit compatibility with old numeric entities that
- // assumed Windows-1252 encoding.
- // https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
- var replacementTable = [...]rune{
- '\u20AC', // First entry is what 0x80 should be replaced with.
- '\u0081',
- '\u201A',
- '\u0192',
- '\u201E',
- '\u2026',
- '\u2020',
- '\u2021',
- '\u02C6',
- '\u2030',
- '\u0160',
- '\u2039',
- '\u0152',
- '\u008D',
- '\u017D',
- '\u008F',
- '\u0090',
- '\u2018',
- '\u2019',
- '\u201C',
- '\u201D',
- '\u2022',
- '\u2013',
- '\u2014',
- '\u02DC',
- '\u2122',
- '\u0161',
- '\u203A',
- '\u0153',
- '\u009D',
- '\u017E',
- '\u0178', // Last entry is 0x9F.
- // 0x00->'\uFFFD' is handled programmatically.
- // 0x0D->'\u000D' is a no-op.
- }
-
- // unescapeEntity reads an entity like "<" from b[src:] and writes the
- // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
- // Precondition: b[src] == '&' && dst <= src.
- // attribute should be true if parsing an attribute value.
- func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
- // https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
-
- // i starts at 1 because we already know that s[0] == '&'.
- i, s := 1, b[src:]
-
- if len(s) <= 1 {
- b[dst] = b[src]
- return dst + 1, src + 1
- }
-
- if s[i] == '#' {
- if len(s) <= 3 { // We need to have at least "&#.".
- b[dst] = b[src]
- return dst + 1, src + 1
- }
- i++
- c := s[i]
- hex := false
- if c == 'x' || c == 'X' {
- hex = true
- i++
- }
-
- x := '\x00'
- for i < len(s) {
- c = s[i]
- i++
- if hex {
- if '0' <= c && c <= '9' {
- x = 16*x + rune(c) - '0'
- continue
- } else if 'a' <= c && c <= 'f' {
- x = 16*x + rune(c) - 'a' + 10
- continue
- } else if 'A' <= c && c <= 'F' {
- x = 16*x + rune(c) - 'A' + 10
- continue
- }
- } else if '0' <= c && c <= '9' {
- x = 10*x + rune(c) - '0'
- continue
- }
- if c != ';' {
- i--
- }
- break
- }
-
- if i <= 3 { // No characters matched.
- b[dst] = b[src]
- return dst + 1, src + 1
- }
-
- if 0x80 <= x && x <= 0x9F {
- // Replace characters from Windows-1252 with UTF-8 equivalents.
- x = replacementTable[x-0x80]
- } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
- // Replace invalid characters with the replacement character.
- x = '\uFFFD'
- }
-
- return dst + utf8.EncodeRune(b[dst:], x), src + i
- }
-
- // Consume the maximum number of characters possible, with the
- // consumed characters matching one of the named references.
-
- for i < len(s) {
- c := s[i]
- i++
- // Lower-cased characters are more common in entities, so we check for them first.
- if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
- continue
- }
- if c != ';' {
- i--
- }
- break
- }
-
- entityName := string(s[1:i])
- if entityName == "" {
- // No-op.
- } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
- // No-op.
- } else if x := entity[entityName]; x != 0 {
- return dst + utf8.EncodeRune(b[dst:], x), src + i
- } else if x := entity2[entityName]; x[0] != 0 {
- dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
- return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
- } else if !attribute {
- maxLen := len(entityName) - 1
- if maxLen > longestEntityWithoutSemicolon {
- maxLen = longestEntityWithoutSemicolon
- }
- for j := maxLen; j > 1; j-- {
- if x := entity[entityName[:j]]; x != 0 {
- return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
- }
- }
- }
-
- dst1, src1 = dst+i, src+i
- copy(b[dst:dst1], b[src:src1])
- return dst1, src1
- }
-
- // unescape unescapes b's entities in-place, so that "a<b" becomes "a<b".
- // attribute should be true if parsing an attribute value.
- func unescape(b []byte, attribute bool) []byte {
- for i, c := range b {
- if c == '&' {
- dst, src := unescapeEntity(b, i, i, attribute)
- for src < len(b) {
- c := b[src]
- if c == '&' {
- dst, src = unescapeEntity(b, dst, src, attribute)
- } else {
- b[dst] = c
- dst, src = dst+1, src+1
- }
- }
- return b[0:dst]
- }
- }
- return b
- }
-
- // lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
- func lower(b []byte) []byte {
- for i, c := range b {
- if 'A' <= c && c <= 'Z' {
- b[i] = c + 'a' - 'A'
- }
- }
- return b
- }
-
- const escapedChars = "&'<>\"\r"
-
- func escape(w writer, s string) error {
- i := strings.IndexAny(s, escapedChars)
- for i != -1 {
- if _, err := w.WriteString(s[:i]); err != nil {
- return err
- }
- var esc string
- switch s[i] {
- case '&':
- esc = "&"
- case '\'':
- // "'" is shorter than "'" and apos was not in HTML until HTML5.
- esc = "'"
- case '<':
- esc = "<"
- case '>':
- esc = ">"
- case '"':
- // """ is shorter than """.
- esc = """
- case '\r':
- esc = " "
- default:
- panic("unrecognized escape character")
- }
- s = s[i+1:]
- if _, err := w.WriteString(esc); err != nil {
- return err
- }
- i = strings.IndexAny(s, escapedChars)
- }
- _, err := w.WriteString(s)
- return err
- }
-
- // EscapeString escapes special characters like "<" to become "<". It
- // escapes only five such characters: <, >, &, ' and ".
- // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
- // always true.
- func EscapeString(s string) string {
- if strings.IndexAny(s, escapedChars) == -1 {
- return s
- }
- var buf bytes.Buffer
- escape(&buf, s)
- return buf.String()
- }
-
- // UnescapeString unescapes entities like "<" to become "<". It unescapes a
- // larger range of entities than EscapeString escapes. For example, "á"
- // unescapes to "á", as does "á" and "&xE1;".
- // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
- // always true.
- func UnescapeString(s string) string {
- for _, c := range s {
- if c == '&' {
- return string(unescape([]byte(s), false))
- }
- }
- return s
- }
|