```python
def createVocabList(dataSet):
    """
    Build the set of all words that appear in the dataset.
    :param dataSet: the dataset, a list of tokenized documents
    :return: a list of all unique words (no duplicates)
    """
    vocabSet = set([])  # create an empty set
    for document in dataSet:
        # the | operator takes the union of two sets
        vocabSet = vocabSet | set(document)
    return list(vocabSet)
```
```python
def setOfWords2Vec(vocabList, inputSet):
    """
    Check whether each vocabulary word occurs in the document; set its slot to 1 if it does.
    :param vocabList: the list of all words in the vocabulary
    :param inputSet: the input document (a list of words)
    :return: a 0/1 vector such as [0, 1, 0, 1, ...], where 1 or 0 indicates
             whether the corresponding vocabulary word occurs in the input
    """
    # create a vector of the same length as the vocabulary, filled with 0s
    returnVec = [0] * len(vocabList)
    # for every word in the document, set the matching vocabulary slot to 1
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print "the word: %s is not in my Vocabulary!" % word
    return returnVec
```
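A quick usage sketch on hypothetical toy documents (not from the book): `createVocabList()` builds the vocabulary, and `setOfWords2Vec()` maps a document onto it:

```python
# hypothetical toy data for illustration
postingList = [['my', 'dog', 'has', 'flea', 'problems'],
               ['stop', 'posting', 'stupid', 'garbage']]
myVocabList = createVocabList(postingList)          # word order is arbitrary (set-based)
print setOfWords2Vec(myVocabList, postingList[0])   # 1s at the slots of 'my', 'dog', ...
```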
A sample email to be parsed:

> With Jose out of town, do you want to meet once in a while to keep things going and do some interesting stuff? Let me know, Eugene
> Prepare the data: parse text files into token vectors

Use regular expressions to split the text:
```python
>>> mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'
>>> import re
>>> regEx = re.compile('\\W*')
>>> listOfTokens = regEx.split(mySent)
>>> listOfTokens
['This', 'book', 'is', 'the', 'best', 'book', 'on', 'Python', 'or', 'M', 'L', '', 'I', 'have', 'ever', 'laid', 'eyes', 'upon', '']
```

Note that the regex split breaks 'M.L.' apart and leaves empty strings; the `textParse()` function later in this section filters these out by keeping only tokens longer than two characters and lowercasing them.
If a word can appear more than once in a document, the set-of-words model above loses that information; the bag-of-words model counts every occurrence instead:

```python
def bagOfWords2VecMN(vocabList, inputSet):
    """Bag-of-words model: count how many times each vocabulary word occurs."""
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec
```
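A quick comparison on a hypothetical three-word vocabulary, showing how the two models differ on a repeated word:

```python
vocab = ['my', 'dog', 'stop']
print setOfWords2Vec(vocab, ['my', 'dog', 'dog'])    # [1, 1, 0] -- presence only
print bagOfWords2VecMN(vocab, ['my', 'dog', 'dog'])  # [1, 2, 0] -- counts occurrences
```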
```python
def textParse(bigString):
    """Parse a big string into a list of lowercase tokens longer than two characters."""
    import re
    # note: this pattern works under Python 2; on Python 3.7+ a pattern that can
    # match the empty string splits between every character, so use r'\W+' there
    listOfTokens = re.split(r'\W*', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
```
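Running `textParse()` on the sentence from the interactive session above shows the effect of the length filter and lowercasing (output computed under Python 2):

```python
>>> textParse('This book is the best book on Python or M.L. I have ever laid eyes upon.')
['this', 'book', 'the', 'best', 'book', 'python', 'have', 'ever', 'laid', 'eyes', 'upon']
```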
```python
# RSS feed classifier and frequent-word removal
from numpy import *  # provides array(), random.uniform() and log() used below

def calcMostFreq(vocabList, fullText):
    import operator
    freqDict = {}
    for token in vocabList:                      # for every word in the vocabulary
        freqDict[token] = fullText.count(token)  # count its occurrences in the text
    # sort the dictionary by count, from highest to lowest
    sortedFreq = sorted(freqDict.iteritems(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]                       # return the 30 most frequent words

def localWords(feed1, feed0):
    import feedparser
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])  # one RSS entry at a time
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:                     # remove the most frequent words
        if pairW[0] in vocabList: vocabList.remove(pairW[0])
    trainingSet = range(2 * minLen); testSet = []
    for i in range(20):                          # randomly hold out 20 entries for testing
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is:', float(errorCount) / len(testSet)
    return vocabList, p0V, p1V
```
```python
# naive Bayes classification function
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # work in log space: sums of log-probabilities replace products of
    # probabilities, which avoids floating-point underflow
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
```
```python
>>> reload(bayes)
<module 'bayes' from 'bayes.pyc'>
>>> import feedparser
>>> ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
>>> sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
>>> vocabList, pSF, pNY = bayes.localWords(ny, sf)
the error rate is: 0.2
>>> vocabList, pSF, pNY = bayes.localWords(ny, sf)
the error rate is: 0.3
>>> vocabList, pSF, pNY = bayes.localWords(ny, sf)
the error rate is: 0.55
```
To get an accurate estimate of the error rate, the experiment above should be repeated many times and the results averaged, as sketched below.
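A minimal sketch of such a repetition loop. It assumes `localWords()` has been modified to also return its error rate as a fourth value (the book's version only prints it); each call draws a fresh random test set:

```python
def multiTest(ny, sf, numRuns=10):
    # assumes localWords() returns (vocabList, p0V, p1V, errorRate) --
    # a hypothetical extension of the three-value version above
    errorSum = 0.0
    for _ in range(numRuns):
        vocabList, p0V, p1V, errorRate = localWords(ny, sf)
        errorSum += errorRate
    print 'the average error rate is:', errorSum / numRuns
```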
Next, we analyze the data to display the most region-specific words.

We can sort the vectors pSF and pNY first and then print the words in order. Add the following code to the file:
```python
# display the most characteristic words for each region
def getTopWords(ny, sf):
    import operator
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []; topSF = []
    # keep every word whose log conditional probability exceeds the threshold
    for i in range(len(p0V)):
        if p0V[i] > -6.0: topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0: topNY.append((vocabList[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print "SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**"
    for item in sortedSF:
        print item[0]
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print "NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**"
    for item in sortedNY:
        print item[0]
```
The getTopWords() function takes two RSS feeds as input, trains and tests a naive Bayes classifier, and gets back the probability vectors it uses. It then creates two lists to hold (word, probability) tuples. Unlike the earlier approach of returning the X highest-ranked words, here it returns every word whose log conditional probability exceeds a fixed threshold (-6.0), and these tuples are sorted by their conditional probability.
```python
>>> reload(bayes)
<module 'bayes' from 'bayes.pyc'>
>>> bayes.getTopWords(ny, sf)
the error rate is: 0.55
SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**
how
last
man
...
veteran
still
ends
late
off
own
know
NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**
someone
meet
...
apparel
recalled
starting
strings
```