123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713 |
- // Copyright 2012 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
-
- // +build ignore
-
- package main
-
- // This program generates table.go and table_test.go based on the authoritative
- // public suffix list at https://publicsuffix.org/list/effective_tld_names.dat
- //
- // The version is derived from
- // https://api.github.com/repos/publicsuffix/list/commits?path=public_suffix_list.dat
- // and a human-readable form is at
- // https://github.com/publicsuffix/list/commits/master/public_suffix_list.dat
- //
- // To fetch a particular git revision, such as 5c70ccd250, pass
- // -url "https://raw.githubusercontent.com/publicsuffix/list/5c70ccd250/public_suffix_list.dat"
- // and -version "an explicit version string".
-
- import (
- "bufio"
- "bytes"
- "flag"
- "fmt"
- "go/format"
- "io"
- "io/ioutil"
- "net/http"
- "os"
- "regexp"
- "sort"
- "strings"
-
- "golang.org/x/net/idna"
- )
-
- const (
- // These sum of these four values must be no greater than 32.
- nodesBitsChildren = 9
- nodesBitsICANN = 1
- nodesBitsTextOffset = 15
- nodesBitsTextLength = 6
-
- // These sum of these four values must be no greater than 32.
- childrenBitsWildcard = 1
- childrenBitsNodeType = 2
- childrenBitsHi = 14
- childrenBitsLo = 14
- )
-
- var (
- maxChildren int
- maxTextOffset int
- maxTextLength int
- maxHi uint32
- maxLo uint32
- )
-
- func max(a, b int) int {
- if a < b {
- return b
- }
- return a
- }
-
- func u32max(a, b uint32) uint32 {
- if a < b {
- return b
- }
- return a
- }
-
- const (
- nodeTypeNormal = 0
- nodeTypeException = 1
- nodeTypeParentOnly = 2
- numNodeType = 3
- )
-
- func nodeTypeStr(n int) string {
- switch n {
- case nodeTypeNormal:
- return "+"
- case nodeTypeException:
- return "!"
- case nodeTypeParentOnly:
- return "o"
- }
- panic("unreachable")
- }
-
- const (
- defaultURL = "https://publicsuffix.org/list/effective_tld_names.dat"
- gitCommitURL = "https://api.github.com/repos/publicsuffix/list/commits?path=public_suffix_list.dat"
- )
-
- var (
- labelEncoding = map[string]uint32{}
- labelsList = []string{}
- labelsMap = map[string]bool{}
- rules = []string{}
-
- // validSuffixRE is used to check that the entries in the public suffix
- // list are in canonical form (after Punycode encoding). Specifically,
- // capital letters are not allowed.
- validSuffixRE = regexp.MustCompile(`^[a-z0-9_\!\*\-\.]+$`)
-
- shaRE = regexp.MustCompile(`"sha":"([^"]+)"`)
- dateRE = regexp.MustCompile(`"committer":{[^{]+"date":"([^"]+)"`)
-
- comments = flag.Bool("comments", false, "generate table.go comments, for debugging")
- subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
- url = flag.String("url", defaultURL, "URL of the publicsuffix.org list. If empty, stdin is read instead")
- v = flag.Bool("v", false, "verbose output (to stderr)")
- version = flag.String("version", "", "the effective_tld_names.dat version")
- )
-
- func main() {
- if err := main1(); err != nil {
- fmt.Fprintln(os.Stderr, err)
- os.Exit(1)
- }
- }
-
- func main1() error {
- flag.Parse()
- if nodesBitsTextLength+nodesBitsTextOffset+nodesBitsICANN+nodesBitsChildren > 32 {
- return fmt.Errorf("not enough bits to encode the nodes table")
- }
- if childrenBitsLo+childrenBitsHi+childrenBitsNodeType+childrenBitsWildcard > 32 {
- return fmt.Errorf("not enough bits to encode the children table")
- }
- if *version == "" {
- if *url != defaultURL {
- return fmt.Errorf("-version was not specified, and the -url is not the default one")
- }
- sha, date, err := gitCommit()
- if err != nil {
- return err
- }
- *version = fmt.Sprintf("publicsuffix.org's public_suffix_list.dat, git revision %s (%s)", sha, date)
- }
- var r io.Reader = os.Stdin
- if *url != "" {
- res, err := http.Get(*url)
- if err != nil {
- return err
- }
- if res.StatusCode != http.StatusOK {
- return fmt.Errorf("bad GET status for %s: %d", *url, res.Status)
- }
- r = res.Body
- defer res.Body.Close()
- }
-
- var root node
- icann := false
- br := bufio.NewReader(r)
- for {
- s, err := br.ReadString('\n')
- if err != nil {
- if err == io.EOF {
- break
- }
- return err
- }
- s = strings.TrimSpace(s)
- if strings.Contains(s, "BEGIN ICANN DOMAINS") {
- icann = true
- continue
- }
- if strings.Contains(s, "END ICANN DOMAINS") {
- icann = false
- continue
- }
- if s == "" || strings.HasPrefix(s, "//") {
- continue
- }
- s, err = idna.ToASCII(s)
- if err != nil {
- return err
- }
- if !validSuffixRE.MatchString(s) {
- return fmt.Errorf("bad publicsuffix.org list data: %q", s)
- }
-
- if *subset {
- switch {
- case s == "ac.jp" || strings.HasSuffix(s, ".ac.jp"):
- case s == "ak.us" || strings.HasSuffix(s, ".ak.us"):
- case s == "ao" || strings.HasSuffix(s, ".ao"):
- case s == "ar" || strings.HasSuffix(s, ".ar"):
- case s == "arpa" || strings.HasSuffix(s, ".arpa"):
- case s == "cy" || strings.HasSuffix(s, ".cy"):
- case s == "dyndns.org" || strings.HasSuffix(s, ".dyndns.org"):
- case s == "jp":
- case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"):
- case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"):
- case s == "om" || strings.HasSuffix(s, ".om"):
- case s == "uk" || strings.HasSuffix(s, ".uk"):
- case s == "uk.com" || strings.HasSuffix(s, ".uk.com"):
- case s == "tw" || strings.HasSuffix(s, ".tw"):
- case s == "zw" || strings.HasSuffix(s, ".zw"):
- case s == "xn--p1ai" || strings.HasSuffix(s, ".xn--p1ai"):
- // xn--p1ai is Russian-Cyrillic "рф".
- default:
- continue
- }
- }
-
- rules = append(rules, s)
-
- nt, wildcard := nodeTypeNormal, false
- switch {
- case strings.HasPrefix(s, "*."):
- s, nt = s[2:], nodeTypeParentOnly
- wildcard = true
- case strings.HasPrefix(s, "!"):
- s, nt = s[1:], nodeTypeException
- }
- labels := strings.Split(s, ".")
- for n, i := &root, len(labels)-1; i >= 0; i-- {
- label := labels[i]
- n = n.child(label)
- if i == 0 {
- if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly {
- n.nodeType = nt
- }
- n.icann = n.icann && icann
- n.wildcard = n.wildcard || wildcard
- }
- labelsMap[label] = true
- }
- }
- labelsList = make([]string, 0, len(labelsMap))
- for label := range labelsMap {
- labelsList = append(labelsList, label)
- }
- sort.Strings(labelsList)
-
- if err := generate(printReal, &root, "table.go"); err != nil {
- return err
- }
- if err := generate(printTest, &root, "table_test.go"); err != nil {
- return err
- }
- return nil
- }
-
- func generate(p func(io.Writer, *node) error, root *node, filename string) error {
- buf := new(bytes.Buffer)
- if err := p(buf, root); err != nil {
- return err
- }
- b, err := format.Source(buf.Bytes())
- if err != nil {
- return err
- }
- return ioutil.WriteFile(filename, b, 0644)
- }
-
- func gitCommit() (sha, date string, retErr error) {
- res, err := http.Get(gitCommitURL)
- if err != nil {
- return "", "", err
- }
- if res.StatusCode != http.StatusOK {
- return "", "", fmt.Errorf("bad GET status for %s: %d", gitCommitURL, res.Status)
- }
- defer res.Body.Close()
- b, err := ioutil.ReadAll(res.Body)
- if err != nil {
- return "", "", err
- }
- if m := shaRE.FindSubmatch(b); m != nil {
- sha = string(m[1])
- }
- if m := dateRE.FindSubmatch(b); m != nil {
- date = string(m[1])
- }
- if sha == "" || date == "" {
- retErr = fmt.Errorf("could not find commit SHA and date in %s", gitCommitURL)
- }
- return sha, date, retErr
- }
-
- func printTest(w io.Writer, n *node) error {
- fmt.Fprintf(w, "// generated by go run gen.go; DO NOT EDIT\n\n")
- fmt.Fprintf(w, "package publicsuffix\n\nvar rules = [...]string{\n")
- for _, rule := range rules {
- fmt.Fprintf(w, "%q,\n", rule)
- }
- fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n")
- if err := n.walk(w, printNodeLabel); err != nil {
- return err
- }
- fmt.Fprintf(w, "}\n")
- return nil
- }
-
- func printReal(w io.Writer, n *node) error {
- const header = `// generated by go run gen.go; DO NOT EDIT
-
- package publicsuffix
-
- const version = %q
-
- const (
- nodesBitsChildren = %d
- nodesBitsICANN = %d
- nodesBitsTextOffset = %d
- nodesBitsTextLength = %d
-
- childrenBitsWildcard = %d
- childrenBitsNodeType = %d
- childrenBitsHi = %d
- childrenBitsLo = %d
- )
-
- const (
- nodeTypeNormal = %d
- nodeTypeException = %d
- nodeTypeParentOnly = %d
- )
-
- // numTLD is the number of top level domains.
- const numTLD = %d
-
- `
- fmt.Fprintf(w, header, *version,
- nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength,
- childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo,
- nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))
-
- text := combineText(labelsList)
- if text == "" {
- return fmt.Errorf("internal error: makeText returned no text")
- }
- for _, label := range labelsList {
- offset, length := strings.Index(text, label), len(label)
- if offset < 0 {
- return fmt.Errorf("internal error: could not find %q in text %q", label, text)
- }
- maxTextOffset, maxTextLength = max(maxTextOffset, offset), max(maxTextLength, length)
- if offset >= 1<<nodesBitsTextOffset {
- return fmt.Errorf("text offset %d is too large, or nodeBitsTextOffset is too small", offset)
- }
- if length >= 1<<nodesBitsTextLength {
- return fmt.Errorf("text length %d is too large, or nodeBitsTextLength is too small", length)
- }
- labelEncoding[label] = uint32(offset)<<nodesBitsTextLength | uint32(length)
- }
- fmt.Fprintf(w, "// Text is the combined text of all labels.\nconst text = ")
- for len(text) > 0 {
- n, plus := len(text), ""
- if n > 64 {
- n, plus = 64, " +"
- }
- fmt.Fprintf(w, "%q%s\n", text[:n], plus)
- text = text[n:]
- }
-
- if err := n.walk(w, assignIndexes); err != nil {
- return err
- }
-
- fmt.Fprintf(w, `
-
- // nodes is the list of nodes. Each node is represented as a uint32, which
- // encodes the node's children, wildcard bit and node type (as an index into
- // the children array), ICANN bit and text.
- //
- // If the table was generated with the -comments flag, there is a //-comment
- // after each node's data. In it is the nodes-array indexes of the children,
- // formatted as (n0x1234-n0x1256), with * denoting the wildcard bit. The
- // nodeType is printed as + for normal, ! for exception, and o for parent-only
- // nodes that have children but don't match a domain label in their own right.
- // An I denotes an ICANN domain.
- //
- // The layout within the uint32, from MSB to LSB, is:
- // [%2d bits] unused
- // [%2d bits] children index
- // [%2d bits] ICANN bit
- // [%2d bits] text index
- // [%2d bits] text length
- var nodes = [...]uint32{
- `,
- 32-nodesBitsChildren-nodesBitsICANN-nodesBitsTextOffset-nodesBitsTextLength,
- nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength)
- if err := n.walk(w, printNode); err != nil {
- return err
- }
- fmt.Fprintf(w, `}
-
- // children is the list of nodes' children, the parent's wildcard bit and the
- // parent's node type. If a node has no children then their children index
- // will be in the range [0, 6), depending on the wildcard bit and node type.
- //
- // The layout within the uint32, from MSB to LSB, is:
- // [%2d bits] unused
- // [%2d bits] wildcard bit
- // [%2d bits] node type
- // [%2d bits] high nodes index (exclusive) of children
- // [%2d bits] low nodes index (inclusive) of children
- var children=[...]uint32{
- `,
- 32-childrenBitsWildcard-childrenBitsNodeType-childrenBitsHi-childrenBitsLo,
- childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo)
- for i, c := range childrenEncoding {
- s := "---------------"
- lo := c & (1<<childrenBitsLo - 1)
- hi := (c >> childrenBitsLo) & (1<<childrenBitsHi - 1)
- if lo != hi {
- s = fmt.Sprintf("n0x%04x-n0x%04x", lo, hi)
- }
- nodeType := int(c>>(childrenBitsLo+childrenBitsHi)) & (1<<childrenBitsNodeType - 1)
- wildcard := c>>(childrenBitsLo+childrenBitsHi+childrenBitsNodeType) != 0
- if *comments {
- fmt.Fprintf(w, "0x%08x, // c0x%04x (%s)%s %s\n",
- c, i, s, wildcardStr(wildcard), nodeTypeStr(nodeType))
- } else {
- fmt.Fprintf(w, "0x%x,\n", c)
- }
- }
- fmt.Fprintf(w, "}\n\n")
- fmt.Fprintf(w, "// max children %d (capacity %d)\n", maxChildren, 1<<nodesBitsChildren-1)
- fmt.Fprintf(w, "// max text offset %d (capacity %d)\n", maxTextOffset, 1<<nodesBitsTextOffset-1)
- fmt.Fprintf(w, "// max text length %d (capacity %d)\n", maxTextLength, 1<<nodesBitsTextLength-1)
- fmt.Fprintf(w, "// max hi %d (capacity %d)\n", maxHi, 1<<childrenBitsHi-1)
- fmt.Fprintf(w, "// max lo %d (capacity %d)\n", maxLo, 1<<childrenBitsLo-1)
- return nil
- }
-
- type node struct {
- label string
- nodeType int
- icann bool
- wildcard bool
- // nodesIndex and childrenIndex are the index of this node in the nodes
- // and the index of its children offset/length in the children arrays.
- nodesIndex, childrenIndex int
- // firstChild is the index of this node's first child, or zero if this
- // node has no children.
- firstChild int
- // children are the node's children, in strictly increasing node label order.
- children []*node
- }
-
- func (n *node) walk(w io.Writer, f func(w1 io.Writer, n1 *node) error) error {
- if err := f(w, n); err != nil {
- return err
- }
- for _, c := range n.children {
- if err := c.walk(w, f); err != nil {
- return err
- }
- }
- return nil
- }
-
- // child returns the child of n with the given label. The child is created if
- // it did not exist beforehand.
- func (n *node) child(label string) *node {
- for _, c := range n.children {
- if c.label == label {
- return c
- }
- }
- c := &node{
- label: label,
- nodeType: nodeTypeParentOnly,
- icann: true,
- }
- n.children = append(n.children, c)
- sort.Sort(byLabel(n.children))
- return c
- }
-
- type byLabel []*node
-
- func (b byLabel) Len() int { return len(b) }
- func (b byLabel) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
- func (b byLabel) Less(i, j int) bool { return b[i].label < b[j].label }
-
- var nextNodesIndex int
-
- // childrenEncoding are the encoded entries in the generated children array.
- // All these pre-defined entries have no children.
- var childrenEncoding = []uint32{
- 0 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeNormal.
- 1 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeException.
- 2 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeParentOnly.
- 4 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeNormal.
- 5 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeException.
- 6 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeParentOnly.
- }
-
- var firstCallToAssignIndexes = true
-
- func assignIndexes(w io.Writer, n *node) error {
- if len(n.children) != 0 {
- // Assign nodesIndex.
- n.firstChild = nextNodesIndex
- for _, c := range n.children {
- c.nodesIndex = nextNodesIndex
- nextNodesIndex++
- }
-
- // The root node's children is implicit.
- if firstCallToAssignIndexes {
- firstCallToAssignIndexes = false
- return nil
- }
-
- // Assign childrenIndex.
- maxChildren = max(maxChildren, len(childrenEncoding))
- if len(childrenEncoding) >= 1<<nodesBitsChildren {
- return fmt.Errorf("children table size %d is too large, or nodeBitsChildren is too small", len(childrenEncoding))
- }
- n.childrenIndex = len(childrenEncoding)
- lo := uint32(n.firstChild)
- hi := lo + uint32(len(n.children))
- maxLo, maxHi = u32max(maxLo, lo), u32max(maxHi, hi)
- if lo >= 1<<childrenBitsLo {
- return fmt.Errorf("children lo %d is too large, or childrenBitsLo is too small", lo)
- }
- if hi >= 1<<childrenBitsHi {
- return fmt.Errorf("children hi %d is too large, or childrenBitsHi is too small", hi)
- }
- enc := hi<<childrenBitsLo | lo
- enc |= uint32(n.nodeType) << (childrenBitsLo + childrenBitsHi)
- if n.wildcard {
- enc |= 1 << (childrenBitsLo + childrenBitsHi + childrenBitsNodeType)
- }
- childrenEncoding = append(childrenEncoding, enc)
- } else {
- n.childrenIndex = n.nodeType
- if n.wildcard {
- n.childrenIndex += numNodeType
- }
- }
- return nil
- }
-
- func printNode(w io.Writer, n *node) error {
- for _, c := range n.children {
- s := "---------------"
- if len(c.children) != 0 {
- s = fmt.Sprintf("n0x%04x-n0x%04x", c.firstChild, c.firstChild+len(c.children))
- }
- encoding := labelEncoding[c.label]
- if c.icann {
- encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset)
- }
- encoding |= uint32(c.childrenIndex) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN)
- if *comments {
- fmt.Fprintf(w, "0x%08x, // n0x%04x c0x%04x (%s)%s %s %s %s\n",
- encoding, c.nodesIndex, c.childrenIndex, s, wildcardStr(c.wildcard),
- nodeTypeStr(c.nodeType), icannStr(c.icann), c.label,
- )
- } else {
- fmt.Fprintf(w, "0x%x,\n", encoding)
- }
- }
- return nil
- }
-
- func printNodeLabel(w io.Writer, n *node) error {
- for _, c := range n.children {
- fmt.Fprintf(w, "%q,\n", c.label)
- }
- return nil
- }
-
- func icannStr(icann bool) string {
- if icann {
- return "I"
- }
- return " "
- }
-
- func wildcardStr(wildcard bool) string {
- if wildcard {
- return "*"
- }
- return " "
- }
-
- // combineText combines all the strings in labelsList to form one giant string.
- // Overlapping strings will be merged: "arpa" and "parliament" could yield
- // "arparliament".
- func combineText(labelsList []string) string {
- beforeLength := 0
- for _, s := range labelsList {
- beforeLength += len(s)
- }
-
- text := crush(removeSubstrings(labelsList))
- if *v {
- fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text))
- }
- return text
- }
-
- type byLength []string
-
- func (s byLength) Len() int { return len(s) }
- func (s byLength) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
- func (s byLength) Less(i, j int) bool { return len(s[i]) < len(s[j]) }
-
- // removeSubstrings returns a copy of its input with any strings removed
- // that are substrings of other provided strings.
- func removeSubstrings(input []string) []string {
- // Make a copy of input.
- ss := append(make([]string, 0, len(input)), input...)
- sort.Sort(byLength(ss))
-
- for i, shortString := range ss {
- // For each string, only consider strings higher than it in sort order, i.e.
- // of equal length or greater.
- for _, longString := range ss[i+1:] {
- if strings.Contains(longString, shortString) {
- ss[i] = ""
- break
- }
- }
- }
-
- // Remove the empty strings.
- sort.Strings(ss)
- for len(ss) > 0 && ss[0] == "" {
- ss = ss[1:]
- }
- return ss
- }
-
- // crush combines a list of strings, taking advantage of overlaps. It returns a
- // single string that contains each input string as a substring.
- func crush(ss []string) string {
- maxLabelLen := 0
- for _, s := range ss {
- if maxLabelLen < len(s) {
- maxLabelLen = len(s)
- }
- }
-
- for prefixLen := maxLabelLen; prefixLen > 0; prefixLen-- {
- prefixes := makePrefixMap(ss, prefixLen)
- for i, s := range ss {
- if len(s) <= prefixLen {
- continue
- }
- mergeLabel(ss, i, prefixLen, prefixes)
- }
- }
-
- return strings.Join(ss, "")
- }
-
- // mergeLabel merges the label at ss[i] with the first available matching label
- // in prefixMap, where the last "prefixLen" characters in ss[i] match the first
- // "prefixLen" characters in the matching label.
- // It will merge ss[i] repeatedly until no more matches are available.
- // All matching labels merged into ss[i] are replaced by "".
- func mergeLabel(ss []string, i, prefixLen int, prefixes prefixMap) {
- s := ss[i]
- suffix := s[len(s)-prefixLen:]
- for _, j := range prefixes[suffix] {
- // Empty strings mean "already used." Also avoid merging with self.
- if ss[j] == "" || i == j {
- continue
- }
- if *v {
- fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d): %q and %q share %q\n",
- prefixLen, i, j, ss[i], ss[j], suffix)
- }
- ss[i] += ss[j][prefixLen:]
- ss[j] = ""
- // ss[i] has a new suffix, so merge again if possible.
- // Note: we only have to merge again at the same prefix length. Shorter
- // prefix lengths will be handled in the next iteration of crush's for loop.
- // Can there be matches for longer prefix lengths, introduced by the merge?
- // I believe that any such matches would by necessity have been eliminated
- // during substring removal or merged at a higher prefix length. For
- // instance, in crush("abc", "cde", "bcdef"), combining "abc" and "cde"
- // would yield "abcde", which could be merged with "bcdef." However, in
- // practice "cde" would already have been elimintated by removeSubstrings.
- mergeLabel(ss, i, prefixLen, prefixes)
- return
- }
- }
-
- // prefixMap maps from a prefix to a list of strings containing that prefix. The
- // list of strings is represented as indexes into a slice of strings stored
- // elsewhere.
- type prefixMap map[string][]int
-
- // makePrefixMap constructs a prefixMap from a slice of strings.
- func makePrefixMap(ss []string, prefixLen int) prefixMap {
- prefixes := make(prefixMap)
- for i, s := range ss {
- // We use < rather than <= because if a label matches on a prefix equal to
- // its full length, that's actually a substring match handled by
- // removeSubstrings.
- if prefixLen < len(s) {
- prefix := s[:prefixLen]
- prefixes[prefix] = append(prefixes[prefix], i)
- }
- }
-
- return prefixes
- }
|