mklearn
/
wxurl.py

#! -*- coding:utf-8 -*-
'''
Retrieve featured ("essence") articles shared in WeChat groups.
'''
import sys, os, time, uuid, re, codecs
import urllib2
import chardet
from bs4 import BeautifulSoup
from pymongo import MongoClient

# Python 2 workaround: reloading ``sys`` makes ``setdefaultencoding``
# callable again, so implicit str/unicode conversions in this script use
# UTF-8 instead of the interpreter's default codec.
reload(sys)
sys.setdefaultencoding('utf-8')

def gethtml(url):
    #headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    #req = urllib2.Request(url=url, headers=headers)
    #data = urllib2.urlopen(req).read()
    content = urllib2.urlopen(url).read()
    typeEncode = sys.getfilesystemencoding()  ##系统默认编码
    print chardet.detect(content)
    infoencode = chardet.detect(content).get('encoding', 'utf-8')  ##通过第3方模块来自动提取网页的编码
    html =content # content.decode(infoencode, 'ignore').encode("utf-8")
    #content =urllib2.urlopen(url).read().decode("gb2312").encode("utf-8") #decode("UTF-8").encode(systype) #data #.decode("UTF-8") #.encode(systype)
    print html
    return html


def parseContent(url):
    htmltext=gethtml(url)
    #htmltext = re.sub(r'&nbsp;|\xa0|\\xa0|\u3000|\\u3000|\\u0020|\u0020', '', str(htmltext))
    findReturn = BeautifulSoup(htmltext, "html.parser") #,from_encoding="utf-8")
    d  = findReturn.find('div',{'class' : 'article-content'})
    title = findReturn.find('h1', {'class': 'article-title'})
    foot = d.find('p', {'class': 'footnote'})
    if (not foot is None):
        foot.decompose()
    print 'content:',d #.encode("utf-8"))
    ret = d.get_text("")
    ret.strip()
    #title=findReturn.find('table').find('div',{'class':'title'}).find_next('div')
    print 'desc',str(ret.encode("utf-8"))
    print "title:",str(title)
    return (str(title.string),str(d)) #str(ret.encode("utf-8")))

if __name__ == '__main__':
    print "begin"
    gethtml("http://www.funthinker.cn:8090/h5/?from=singlemessage&isappinstalled=0#/essence/0d8889f1-6f5b-4e1b-ab62-8db3859c7102?_k=jmjp7g")