forked from polaris1119/wordscount
-
Notifications
You must be signed in to change notification settings - Fork 0
/
count.go
149 lines (124 loc) · 3.06 KB
/
count.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
package wordscount
import (
"regexp"
"strings"
"unicode"
"unicode/utf8"
"mvdan.cc/xurls/v2"
)
// Counter holds the statistics gathered by Stat.
type Counter struct {
Total int // total count = Words + Puncts
Words int // word count only, excluding punctuation
Puncts int // number of punctuation marks
Links int // number of links
Pics int // number of images
CodeLines int // number of code lines; NOTE(review): not assigned by Stat in the code visible here
}
// Stat counts the words, punctuation marks, links and images found in str and
// adds them to the values already held by wc, so repeated calls accumulate.
// Call Reset to start again from zero.
//
// Links and images are counted on the raw input. The text is then passed
// through StripHTML and AutoSpace, plain-text links are blanked out, and the
// remainder is split on whitespace. Inside each chunk every punctuation rune
// other than ' and - acts as a separator and is counted in Puncts. A piece
// made only of single-byte characters counts as one word; any other piece
// contributes one word per rune. Total is refreshed as Words + Puncts.
func (wc *Counter) Stat(str string) {
	// Accumulate instead of overwriting, so Links and Pics follow the same
	// "repeated calls add up" contract as Words and Puncts.
	wc.Links += len(rxStrict.FindAllString(str, -1))
	wc.Pics += len(imgReg.FindAllString(str, -1))

	// Strip HTML markup.
	str = StripHTML(str)
	str = AutoSpace(str)
	// Remove plain-text links (those that were not part of an HTML tag).
	str = rxStrict.ReplaceAllString(str, " ")

	// "I'm" and "non-smoker" must each stay a single word, so the apostrophe
	// and the hyphen are not treated as separators.
	isSeparator := func(r rune) bool {
		return unicode.IsPunct(r) && r != '\'' && r != '-'
	}

	for _, chunk := range strings.Fields(str) {
		// Count separators in an explicit loop: strings.FieldsFunc gives no
		// guarantee about how it calls its predicate, so the predicate must
		// stay free of side effects.
		for _, r := range chunk {
			if isSeparator(r) {
				wc.Puncts++
			}
		}

		for _, word := range strings.FieldsFunc(chunk, isSeparator) {
			runeCount := utf8.RuneCountInString(word)
			if len(word) == runeCount {
				// Only single-byte characters: the whole piece is one word.
				wc.Words++
				continue
			}
			// Contains multi-byte characters: every rune counts as a word.
			wc.Words += runeCount
		}
	}

	wc.Total = wc.Words + wc.Puncts
}
// Reset clears every statistic held by wc (Total, Words, Puncts, Links, Pics
// and CodeLines) back to zero.
func (wc *Counter) Reset() {
	// All fields are plain ints, so assigning the zero value clears them all.
	*wc = Counter{}
}
// AutoSpace returns str with one space inserted wherever a single-byte
// character sits next to a multi-byte one (for example between English and
// Chinese text). No space is inserted when either neighbour is whitespace or
// punctuation.
//
// The result is built in a single pass with a strings.Builder instead of
// re-concatenating the whole prefix for every rune.
func AutoSpace(str string) string {
	var b strings.Builder
	// The output is at least as long as the input; inserted spaces may grow it.
	b.Grow(len(str))

	var (
		prev     rune
		prevSize int
	)
	for i, r := range str {
		// Widths are taken from the rune's encoded form, which is also what
		// gets written to the output.
		size := utf8.RuneLen(r)
		if i > 0 && needsBoundarySpace(prev, prevSize, r, size) {
			b.WriteByte(' ')
		}
		b.WriteRune(r)
		prev, prevSize = r, size
	}
	return b.String()
}

// addSpaceAtBoundary appends nextChar to prefix, putting a space in between
// when the last rune of prefix and nextChar form a boundary that calls for
// one. An empty prefix yields just nextChar.
func addSpaceAtBoundary(prefix string, nextChar rune) string {
	if len(prefix) == 0 {
		return string(nextChar)
	}

	last, lastSize := utf8.DecodeLastRuneInString(prefix)
	if needsBoundarySpace(last, lastSize, nextChar, utf8.RuneLen(nextChar)) {
		return prefix + " " + string(nextChar)
	}
	return prefix + string(nextChar)
}

// needsBoundarySpace reports whether a space belongs between two adjacent
// runes: exactly one of them is single-byte, and both pass isAllowSpace.
func needsBoundarySpace(prev rune, prevSize int, next rune, nextSize int) bool {
	return isLatin(prevSize) != isLatin(nextSize) &&
		isAllowSpace(next) && isAllowSpace(prev)
}
// Package-level matchers and replacers shared by Stat and StripHTML.
var (
// rxStrict is the matcher returned by xurls.Strict(); Stat uses it to count
// links and to blank them out of the text.
// NOTE(review): per the xurls docs this should only match URLs that carry a scheme — confirm.
rxStrict = xurls.Strict()
// imgReg matches an opening <img ...> tag; Stat uses it to count images.
imgReg = regexp.MustCompile(`<img [^>]*>`)
// stripHTMLReplacer turns newlines into spaces and </p>, <br> and <br />
// into newlines; StripHTML applies it before removing tags.
stripHTMLReplacer = strings.NewReplacer("\n", " ", "</p>", "\n", "<br>", "\n", "<br />", "\n")
)
// StripHTML returns s with everything between '<' and '>' removed and each
// run of whitespace reduced to its first whitespace character. Input that
// contains neither '<' nor '>' is returned unchanged.
func StripHTML(s string) string {
	// Fast path: no angle brackets means there is nothing to strip.
	if !strings.ContainsAny(s, "<>") {
		return s
	}

	s = stripHTMLReplacer.Replace(s)

	// Copy the runes that survive into a buffer obtained from GetBuffer; it
	// is handed back through PutBuffer when the function returns.
	buf := GetBuffer()
	defer PutBuffer(buf)

	var (
		insideTag bool // currently between '<' and '>'
		curSpace  bool // whitespace state carried into this rune
		prevSpace bool // whitespace state after the previous rune
	)
	for _, r := range s {
		// The whitespace flag is cleared only outside tags, so whitespace
		// seen inside a tag is still remembered once the tag closes.
		if !insideTag {
			curSpace = false
		}

		switch r {
		case '<':
			insideTag = true
		case '>':
			insideTag = false
		default:
			if unicode.IsSpace(r) {
				curSpace = true
			}
			// Keep the rune when outside a tag, unless it would extend a
			// run of whitespace.
			if !insideTag && !(curSpace && prevSpace) {
				buf.WriteRune(r)
			}
		}

		prevSpace = curSpace
	}

	return buf.String()
}
// isLatin reports whether size, the UTF-8 encoded width of a rune in bytes,
// is a single byte. Despite the name, only one-byte (ASCII) runes qualify;
// accented Latin letters take more than one byte and do not.
func isLatin(size int) bool {
	const singleByte = 1
	return size == singleByte
}
// isAllowSpace reports whether a separating space may be placed next to r:
// true for every rune that is neither whitespace nor punctuation.
func isAllowSpace(r rune) bool {
	blocked := unicode.IsSpace(r) || unicode.IsPunct(r)
	return !blocked
}