2 Star 9 Fork 5

守望者 / mklearn

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
wxurl.py 1.88 KB
一键复制 编辑 原始数据 按行查看 历史
守望者 提交于 2016-12-02 19:01 . New changelist
#! -*- coding:utf-8 -*-
'''
#检索微信群精华文章
'''
import sys, os, time, uuid, re, codecs
import urllib2
import chardet
from bs4 import BeautifulSoup
from pymongo import MongoClient
# Reload the sys module and set the interpreter's default string encoding to
# UTF-8 for the rest of the process.
# NOTE(review): this looks like the common Python 2 idiom (the reload is
# presumably needed to make setdefaultencoding available again); neither call
# exists in Python 3 -- confirm before porting this file.
reload(sys)
sys.setdefaultencoding('utf-8')
def gethtml(url):
    """Fetch ``url`` and return the raw, undecoded response body.

    The body is handed back exactly as it was read from the response; no
    re-encoding is applied. For diagnostics, the chardet encoding guess and
    the body itself are written to stdout.

    :param url: address passed straight to ``urllib2.urlopen``.
    :return: the response body as read from the connection.
    :raises: any error raised by ``urllib2.urlopen`` or ``read`` propagates
        to the caller unchanged.
    """
    response = urllib2.urlopen(url)
    try:
        content = response.read()
    finally:
        # Always release the connection, even when read() fails part-way.
        response.close()

    # Encoding guess from the third-party chardet module. It is reported for
    # debugging only; the payload is deliberately returned untouched.
    # Single-argument print(...) behaves the same as the print statement.
    print(chardet.detect(content))
    print(content)
    return content
def parseContent(url):
    """Download an article page and extract its title and content markup.

    The page is fetched with ``gethtml`` and parsed with BeautifulSoup's
    ``html.parser``. Any ``<p class="footnote">`` inside the content
    container is removed before the container is serialised. The content
    markup, its stripped plain text and the title element are echoed to
    stdout for diagnostics.

    :param url: address of the article page.
    :return: ``(title, content_html)`` where ``title`` is the text of
        ``<h1 class="article-title">`` (empty string when the heading is
        absent) and ``content_html`` is the serialised
        ``<div class="article-content">`` element without its footnote.
    :raises ValueError: if the page has no ``<div class="article-content">``.
    """
    soup = BeautifulSoup(gethtml(url), "html.parser")

    body = soup.find('div', {'class': 'article-content'})
    if body is None:
        # Fail with an explicit message instead of an AttributeError on None.
        raise ValueError("no <div class='article-content'> found at %s" % url)
    heading = soup.find('h1', {'class': 'article-title'})

    footnote = body.find('p', {'class': 'footnote'})
    if footnote is not None:
        footnote.decompose()

    print('content: %s' % body)

    # str.strip() returns a new string; keep the result instead of dropping it.
    text = body.get_text("").strip()
    print('desc %s' % text.encode("utf-8"))
    print('title: %s' % heading)

    # get_text() also works for headings with nested markup, where .string
    # would be None and the title would come back as the literal 'None'.
    title_text = heading.get_text() if heading is not None else ''
    return (str(title_text), str(body))
if __name__ == '__main__':
    # Script entry point: announce the start, then fetch one hard-coded page.
    demo_url = "http://www.funthinker.cn:8090/h5/?from=singlemessage&isappinstalled=0#/essence/0d8889f1-6f5b-4e1b-ab62-8db3859c7102?_k=jmjp7g"
    print("begin")
    gethtml(demo_url)
Python
1
https://gitee.com/househou/mklearn.git
git@gitee.com:househou/mklearn.git
househou
mklearn
mklearn
master

搜索帮助