dictionary.go 2.4 KB

  1. // Copyright 2013 Hui Chen
  2. // Copyright 2016 ego authors
  3. //
  4. // Licensed under the Apache License, Version 2.0 (the "License"): you may
  5. // not use this file except in compliance with the License. You may obtain
  6. // a copy of the License at
  7. //
  8. // http://www.apache.org/licenses/LICENSE-2.0
  9. //
  10. // Unless required by applicable law or agreed to in writing, software
  11. // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  12. // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  13. // License for the specific language governing permissions and limitations
  14. // under the License.
  15. package gse
  16. import (
  17. "github.com/go-ego/cedar"
  18. )
// Dictionary implements a prefix trie over token strings.
// A token may end at a leaf node or at an interior node of the trie.
type Dictionary struct {
	trie           *cedar.Cedar // cedar prefix trie; maps token text to an index into tokens
	maxTokenLen    int          // length (in text elements) of the longest token in the dictionary
	tokens         []Token      // all tokens in the dictionary, kept for easy iteration
	totalFrequency int64        // sum of the frequencies of all tokens in the dictionary
}
  27. // NewDict new dictionary
  28. func NewDict() *Dictionary {
  29. return &Dictionary{trie: cedar.New()}
  30. }
// MaxTokenLen returns the length (in text elements) of the
// longest token currently stored in the dictionary.
func (dict *Dictionary) MaxTokenLen() int {
	return dict.maxTokenLen
}
// NumTokens returns the number of tokens in the dictionary.
func (dict *Dictionary) NumTokens() int {
	return len(dict.tokens)
}
// TotalFrequency returns the sum of the frequencies of all
// tokens in the dictionary.
func (dict *Dictionary) TotalFrequency() int64 {
	return dict.totalFrequency
}
  43. // addToken 向词典中加入一个分词
  44. func (dict *Dictionary) addToken(token Token) {
  45. bytes := textSliceToBytes(token.text)
  46. _, err := dict.trie.Get(bytes)
  47. if err == nil {
  48. return
  49. }
  50. dict.trie.Insert(bytes, dict.NumTokens())
  51. dict.tokens = append(dict.tokens, token)
  52. dict.totalFrequency += int64(token.frequency)
  53. if len(token.text) > dict.maxTokenLen {
  54. dict.maxTokenLen = len(token.text)
  55. }
  56. }
  57. // lookupTokens 在词典中查找和字元组 words 可以前缀匹配的所有分词
  58. // 返回值为找到的分词数
  59. func (dict *Dictionary) lookupTokens(words []Text,
  60. tokens []*Token) (numOfTokens int) {
  61. var (
  62. id, value int
  63. err error
  64. )
  65. for _, word := range words {
  66. id, err = dict.trie.Jump(word, id)
  67. if err != nil {
  68. break
  69. }
  70. value, err = dict.trie.Value(id)
  71. if err == nil {
  72. tokens[numOfTokens] = &dict.tokens[value]
  73. numOfTokens++
  74. }
  75. }
  76. return
  77. }