# -*- coding:utf8 -*-
"""Part-of-speech tag every file under a source tree with jieba.

Each input file is read line by line, segmented with ``jieba.posseg`` and
written to ``TARGET_DIR`` as ``word/flag`` tokens, one output line per input
line.  Output files are named after the running index of the input file
("0", "1", ...), in the order ``os.walk`` yields them.

NOTE(review): this script had lost its line breaks; the block structure
below was reconstructed from the statement order -- confirm against the
intended behaviour (in particular that one output line is written per
input line).
"""
import os

import jieba.posseg as pseg

# Directory that receives one tagged file per input file.  It is not created
# by this script, so it must already exist.
TARGET_DIR = '/home/xdj/target/'


def _tagLine(line):
    """Return *line* segmented by jieba as concatenated ``word/flag`` tokens.

    The tokens are joined without any separator, exactly as the original
    accumulation loop did.
    """
    return ''.join(word.word + '/' + word.flag for word in pseg.cut(line))


def splitSentence(inputFile, name):
    """Tag the text of *inputFile* and write the result to TARGET_DIR + name.

    Args:
        inputFile: path of the text file to read.  Each line is decoded as
            UTF-8; bytes that cannot be decoded are silently dropped.
        name: name of the output file; it is appended verbatim to
            ``TARGET_DIR``.

    Raises:
        IOError / OSError: if either file cannot be opened.
    """
    # Binary mode keeps the decode/encode steps explicit, so the function
    # behaves the same on Python 2 and Python 3.  The ``with`` blocks make
    # sure both handles are closed even if tagging or writing fails.
    with open(inputFile, 'rb') as fin:
        print(name)
        with open(TARGET_DIR + name, 'wb') as fout:
            for eachLine in fin:
                # Strip surrounding whitespace on the raw bytes, decode, then
                # drop any newline left exposed by ignored bytes.
                line = eachLine.strip().decode('utf-8', 'ignore').strip('\n')
                fout.write(_tagLine(line).encode('utf-8'))
                fout.write(b'\n')


path = '/media/软件/zhuomian/VARandLDA/xuejiesourse'
# Every regular file below ``path``, at any depth.
fns = [os.path.join(root, fn)
       for root, dirs, files in os.walk(path)
       for fn in files]

# NOTE(review): ``num`` is never incremented, so the final print always shows
# 0.  It is kept so the script's output stays unchanged.
num = 0
for i, f in enumerate(fns):
    print(f)
    splitSentence(f, '%d' % i)
print(num)
# NOTE: everything below is an inert, commented-out duplicate of the script
# above.  Its line breaks had been lost; they are restored here for
# readability only, and none of it is executed.
#
# # -*- coding:utf8 -*-
# import os
# import jieba.posseg as pseg
# # -*- coding:utf8 -*-
# import os
# def splitSentence(inputFile,name):
#     fin = open(inputFile, 'r')  # open the file for reading
#     print name
#     fout= open('/home/xdj/target/'+name,'w')  # open the file for writing
#     for eachLine in fin:
#         line = eachLine.strip().decode('utf-8', 'ignore')  # strip leading/trailing whitespace and convert to Unicode
#         line=line.strip('\n')  # remove extra newlines
#         wordList = pseg.cut(line)  # segment the line with jieba
#         outStr = ''
#         for word in wordList:#
#             #print word.word,word.flag
#             outStr += word.word+'/'+word.flag
#             #print outStr
#         fout.write(outStr.encode('utf-8'))  # write the segmented result to the output file
#         fout.write('\n')
#     fin.close()
#     fout.close()
# path='/media/软件/zhuomian/VARandLDA/xuejiesourse'
# fns=[os.path.join(root,fn) for root,dirs,files in os.walk(path) for fn in files]
# #fout= open('/home/xdj/myOutput.txt','w')
# i=-1
# num=0
# for f in fns:
#     print f
#     i=i+1
#     strm = '%d' %i
#     splitSentence(f,strm)
# #fout.close()
# print num