# Hosting-page notice captured with this file: "Code pull complete; the page will refresh automatically."
#! -*- coding:utf-8 -*-
'''
Retrieve the featured ("essence") articles of a WeChat group.
'''
import sys, os, time, uuid, re, codecs
import urllib2
import chardet
from bs4 import BeautifulSoup
from pymongo import MongoClient
# Python 2-only idiom: ``sys`` is reloaded and then UTF-8 is installed as the
# interpreter-wide default encoding for implicit str/unicode conversions.
# The reload has to come first -- presumably because ``setdefaultencoding``
# is not reachable on the already-imported ``sys`` module; confirm against
# the Python 2 documentation.
# NOTE(review): ``reload`` is not a builtin on Python 3, so this file only
# runs under Python 2 as written.
reload(sys)
sys.setdefaultencoding('utf-8')
def gethtml(url):
    """Download ``url`` and return the response body exactly as received.

    As a debugging aid, the result of ``chardet.detect`` on the body and the
    body itself are printed to stdout.  The body is deliberately NOT decoded
    or re-encoded here; callers receive the raw bytes read from the
    connection.

    :param url: address to fetch; handed unchanged to ``urllib2.urlopen``.
    :returns: the raw response body.

    No exception is caught here: any error raised while opening or reading
    the URL propagates to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        content = response.read()
    finally:
        # Always release the connection, even when read() fails; the
        # response object was previously left open.
        response.close()

    # Run the detection once only; it used to be executed twice with the
    # second result stored in a variable that was never read.
    print(chardet.detect(content))
    print(content)
    return content
def parseContent(url):
    """Fetch an article page and extract its title and body markup.

    The page is downloaded with ``gethtml`` and parsed with BeautifulSoup's
    ``html.parser``.  The body is the ``<div class="article-content">``
    element with any ``<p class="footnote">`` inside it removed; the title
    comes from ``<h1 class="article-title">``.  The body markup, the
    stripped body text and the title element are printed to stdout as a
    debugging aid.

    :param url: address of the article page.
    :returns: ``(title, body)`` tuple of ``str`` -- the title text and the
        markup (not the plain text) of the article body.
    :raises ValueError: when the page contains no ``div.article-content``
        or no ``h1.article-title`` element.
    """
    soup = BeautifulSoup(gethtml(url), "html.parser")
    body = soup.find('div', {'class': 'article-content'})
    title = soup.find('h1', {'class': 'article-title'})
    if body is None or title is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("article title/content not found in page: %s" % url)

    footnote = body.find('p', {'class': 'footnote'})
    if footnote is not None:
        footnote.decompose()

    print('content: ' + str(body))
    # strip() returns a new string; its result used to be thrown away.
    text = body.get_text("").strip()
    print('desc ' + str(text.encode("utf-8")))
    print('title: ' + str(title))

    # ``.string`` is None when the heading wraps more than one child node;
    # fall back to the concatenated text rather than returning 'None'.
    heading = title.string
    if heading is None:
        heading = title.get_text()
    return (str(heading), str(body))
if __name__ == '__main__':
    # Script entry point: announce start-up, then download the sample page
    # (gethtml prints what it fetched).
    # NOTE(review): everything after '#' in this URL is a fragment; confirm
    # that the server response actually contains the article content.
    sample_url = "http://www.funthinker.cn:8090/h5/?from=singlemessage&isappinstalled=0#/essence/0d8889f1-6f5b-4e1b-ab62-8db3859c7102?_k=jmjp7g"
    print("begin")
    gethtml(sample_url)
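# Hosting-page notice captured with this file: "This section may contain content that is not suitable for display, so the page does not show it. You can review and revise it using the relevant editing features."
# "If you confirm the content involves no inappropriate language / pure advertising / violence / vulgar or pornographic material / infringement / piracy / false information / worthless content, and nothing that violates applicable national laws and regulations, you can click submit to file an appeal and we will handle it as soon as possible."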