import os
import re
from os import listdir

import jieba
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

# Corpus location and output target.
all_file = listdir('E:/201706120017赖志豪.txt')
outputDir = "E:/output.txt"

labels = []   # document labels taken from the file names
corpus = []   # one space-joined string of segmented words per document
size = 200    # total sample count, used as the accuracy denominator
def buildSW():
    """Build the stopword list: entries from word.txt plus whitespace tokens."""
    texts = ['\u3000', '\n', ' ']  # full-width space, newline, ASCII space
    with open('word.txt', encoding='utf-8') as typetxt:
        for word in typetxt:
            texts.append(word.strip())
    return texts
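# Note: stopword membership is tested once per token in buildWB(); with a
# large word.txt a set is the usual choice for O(1) lookups.  A drop-in
# variant (assuming the order of the stopwords never matters):
#
#     return set(texts)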
def buildWB(texts):
    """Segment every file with jieba, drop stopwords and non-Chinese tokens,
    and append one space-joined string per document to the corpus."""
    # Keep only tokens of two or more consecutive Chinese characters.
    value = re.compile(r'^[\u4e00-\u9fa5]{2,}$')
    for i in range(0, len(all_file)):
        filename = all_file[i]
        filelabel = filename.split('.')[0]
        labels.append(filelabel)        # label = file name without extension
        file_add = '***' + filename     # path prefix elided in the source
        doc = open(file_add, encoding='utf-8').read()
        data = jieba.cut(doc)           # generator of segmented tokens
        data_adj = ''
        delete_word = []                # tokens filtered out by the regex
        for item in data:
            if item not in texts:
                if value.match(item):
                    data_adj += item + ' '
                else:
                    delete_word.append(item)
        corpus.append(data_adj)
    return corpus
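# Note: jieba.cut() returns a generator that can be consumed only once;
# jieba.lcut() is the equivalent list-returning form, handy when the tokens
# need to be traversed more than once:
#
#     data = jieba.lcut(doc)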
def countIdf(corpus):
    """Turn the corpus into a dense TF-IDF weight matrix."""
    vectorizer = CountVectorizer()    # raw term-frequency counts
    transformer = TfidfTransformer()  # reweight the counts by IDF
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    weight = tfidf.toarray()          # documents x vocabulary matrix
    return weight
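# Note: scikit-learn also ships TfidfVectorizer, which fuses CountVectorizer
# and TfidfTransformer into a single step; a minimal equivalent sketch:
#
#     from sklearn.feature_extraction.text import TfidfVectorizer
#     weight = TfidfVectorizer().fit_transform(corpus).toarray()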
def Kmeans(weight, clusters, correct):
    """Cluster the TF-IDF matrix and score each cluster by its majority class.

    Each file label starts with a two-letter class code: gp (stock),
    jy (education), xz (horoscope), ty (sports).  A cluster is named after
    whichever code appears most often inside it, and that majority count is
    accumulated into correct[0].
    """
    mykms = KMeans(n_clusters=clusters)
    y = mykms.fit_predict(weight)
    result = []
    for i in range(0, clusters):
        label_i = []
        gp = 0
        jy = 0
        xz = 0
        ty = 0
        for j in range(0, len(y)):
            if y[j] == i:
                label_i.append(labels[j])
                prefix = labels[j][0:2]
                if prefix == 'gp':
                    gp += 1
                elif prefix == 'jy':
                    jy += 1
                elif prefix == 'xz':
                    xz += 1
                elif prefix == 'ty':
                    ty += 1
        # Pick the majority class of this cluster.
        max_count = jy
        category = '教育'      # education
        if gp > jy:
            max_count = gp
            category = '股票'  # stock
        if max_count < xz:
            max_count = xz
            category = '星座'  # horoscope
        if max_count < ty:
            max_count = ty
            category = '体育'  # sports
        correct[0] += max_count
        # '类别' = "category"
        result.append('类别' + '(' + category + ')' + ':' + str(label_i))
    return result
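# Note: cluster ids from KMeans are arbitrary, which is why each cluster is
# scored by its majority class above.  An alternative sanity check (a sketch,
# assuming the two-letter filename prefixes are the ground truth) is
# scikit-learn's adjusted Rand index, which is invariant to cluster naming:
#
#     # inside Kmeans(), after y = mykms.fit_predict(weight):
#     from sklearn.metrics import adjusted_rand_score
#     score = adjusted_rand_score([name[0:2] for name in labels], y)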
def output(result, outputDir, clusters):
    """Write the cluster listing and the overall accuracy to a fresh file."""
    outputFile = 'out'
    suffix = '.txt'
    count = 0
    # Find an unused file name: out.txt, out1.txt, out2.txt, ...
    while os.path.exists(outputDir + outputFile + suffix):
        count += 1
        outputFile = 'out' + str(count)
    doc = open(outputDir + outputFile + suffix, 'w')
    for i in range(0, clusters):
        print(result[i], file=doc)
    # correct is the module-level one-element list filled in by Kmeans().
    # Message: "total samples: ...  correctly classified: ...  accuracy: ..."
    print('本次分类总样本数目为:' + str(size)
          + ' 其中正确分类数目为:' + str(correct[0])
          + ' 正确率为:' + str(correct[0] / size), file=doc)
    doc.close()
# Pipeline: stopwords -> segmented corpus -> TF-IDF -> KMeans -> report.
texts = buildSW()
corpus = buildWB(texts)
weight = countIdf(corpus)
clusters = 4
correct = [0]   # one-element list so Kmeans() can accumulate into it
result = Kmeans(weight, clusters, correct)
output(result, outputDir, clusters)
print('finish')