1 Star 0 Fork 0

hotmocha / chinawealth-spider

Create your Gitee Account
Explore and code with more than 12 million developers. Free private repositories!
Sign up
This repository doesn't specify license. Please pay attention to the specific project description and its upstream code dependency when using it.
Clone or Download
lcspider.py 13.44 KB
Copy Edit Raw Blame History
hotmocha authored 2016-08-04 12:27 . spider-init
#!/usr/bin/env python
# -*- encoding:utf-8 -*-
import re
import cookielib
import urllib
import urllib2
import optparse
import zlib
import json
import time
import random
import HTMLParser
import json
import traceback
import socket
import errno
import datetime,calendar
import sys, os
import getopt
import MySQLdb
from MySQLdb import DatabaseError
# Verbosity switch. parseCmd() sets it to True when "-d" is passed, and
# logger() then echoes every log line to stdout as well.
DEBUG = None

# Record fields grouped by the type name TypeUnionTransfer() dispatches on.
_CHAR_FIELDS = (
    'cpid', 'cpdjbm', 'mjbz', 'tzlxms', 'cpqsrq', 'qxms', 'mjqsrq',
    'tzzlxms', 'cpztms', 'cpyjzzrq', 'cpms', 'fxjgms', 'fxjgdm', 'cpfxdj',
    'fxdjms', 'mjjsrq', 'cplx', 'cplxms', 'cpsylx', 'cpsylxms', 'xsqy',
)
_DOUBLE_FIELDS = (
    'yjkhzdnsyl', 'yjkhzgnsyl', 'qdxsje', 'cpjz', 'dqsjsyl', 'csjz',
)
_INT_FIELDS = ('cpqx',)

# field name -> 'char' | 'double' | 'int'
TypeMapConstant = {}
TypeMapConstant.update(dict.fromkeys(_CHAR_FIELDS, 'char'))
TypeMapConstant.update(dict.fromkeys(_DOUBLE_FIELDS, 'double'))
TypeMapConstant.update(dict.fromkeys(_INT_FIELDS, 'int'))
# Use of enumerations.
def enum(**enums):
    """Create a throwaway class named 'Enum' whose attributes are *enums*.

    Every keyword argument becomes a class attribute with the given value,
    so members are read as ``Result.NAME``.
    """
    members = dict(enums)
    return type('Enum', (), members)


# Status codes used as the first element of getPageInfo()'s result tuple.
RSPSTATUS = enum(
    SUC=0,
    EMPTY=1,
    TIMEOUT=2,
    TOJSONERR=3,
    URLLIBERR=4,
    OTHERERR=5,
)
def getTime():
    """Return the current local time as a 'MMDD.HHMMSS' string."""
    return datetime.datetime.now().strftime("%m%d.%H%M%S")
def getNowDateAndTime():
    """Return the current local (date, time) as ('YYYYMMDD', 'HH:MM:SS')."""
    # Format once from a single clock reading, then split on the space that
    # separates the two parts.
    stamp = datetime.datetime.now().strftime("%Y%m%d %H:%M:%S")
    day, clock = stamp.split(' ')
    return (day, clock)
def getErrnoFromE(e):
    """Extract a numeric error code from exception *e*.

    Returns ``e.errno`` when the attribute exists (even if it is None),
    otherwise the first element of ``e.args``, otherwise None.
    """
    if hasattr(e, 'errno'):
        return e.errno
    return e.args[0] if e.args else None
def logger(content):
    """Append one line to today's log file, tagged with the caller's location.

    The line is written to ``./log/lcspider<YYYYMMDD>.log`` as
    ``<MMDD.HHMMSS> F[<caller file>] L[<caller line>]::<content>``.
    When the global DEBUG flag is set the same line is echoed to stdout.

    The ``./log`` directory is created on demand, so the first call no
    longer fails with IOError on a fresh checkout.
    """
    # The frame one level up is whoever called logger().
    caller = sys._getframe().f_back
    # Local names avoid shadowing the `time` module and the `file` builtin.
    (date, _clock) = getNowDateAndTime()

    logDir = './log'
    if not os.path.isdir(logDir):
        try:
            os.makedirs(logDir)
        except OSError:
            # Another process may have created it in the meantime; only
            # give up if the directory really is not there.
            if not os.path.isdir(logDir):
                raise

    filename = './log/lcspider%s.log' % date
    fmtContent = getTime() + ' F[' + caller.f_code.co_filename + '] L[' + str(caller.f_lineno) + ']::' + content + '\n'
    with open(filename, "a+") as logFile:
        logFile.write(fmtContent)
    if DEBUG:
        sys.stdout.write(fmtContent)
def _decompressBody(raw, encoding):
    """Undo the HTTP Content-Encoding *encoding* applied to the bytes *raw*.

    'gzip' and 'deflate' are handled; any other (or missing) encoding
    returns *raw* unchanged.
    """
    encoding = (encoding or '').strip().lower()
    if encoding == 'gzip':
        # 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
        return zlib.decompress(raw, 16 + zlib.MAX_WBITS)
    if encoding == 'deflate':
        try:
            return zlib.decompress(raw)
        except zlib.error:
            # Fall back to a raw deflate stream without the zlib header.
            return zlib.decompress(raw, -zlib.MAX_WBITS)
    return raw


# RETURN: always a 2-tuple (status, payload), never None.
#   (RSPSTATUS.SUC, list)       - the 'List' entry of the JSON reply
#   (RSPSTATUS.EMPTY, None)     - the reply body was empty
#   (RSPSTATUS.TIMEOUT, None)   - the request timed out
#   (RSPSTATUS.TOJSONERR, str)  - the JSON reply has no usable 'List' entry
#   (RSPSTATUS.URLLIBERR, str)  - urllib2 reported a non-timeout error
#   (RSPSTATUS.OTHERERR, str)   - anything else; payload is the traceback
def getPageInfo(pagenum = None, timeout = 2):
    """POST the product query for one page and return (status, payload).

    pagenum -- page to request; when None the form's built-in '2' is sent.
    timeout -- socket timeout in seconds passed to urllib2.urlopen.
    """
    baseurl = 'http://www.chinawealth.com.cn/lccpAllProJzyServlet.go'
    headers = {
        'Connection' : 'keep-alive',
        'Accept' : 'application/json, text/javascript, */*; q=0.01',
        'Content-Type' : 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-cn',
        'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36 SE 2.X MetaSr 1.0',
        'Origin' : 'http://www.chinawealth.com.cn',
        'Referer' : 'http://www.chinawealth.com.cn/zzlc/jsp/lccp.jsp',
        'Pragma' : 'no-cache',
        'Cache-Control' : 'no-cache',
    }
    postDict = { 'cpjglb':"", 'cpsylx':"", 'cpyzms':"", 'cpfxdj':"", 'cpqx':"", 'cpzt':"02", 'cpdjbm':"", 'cpmc':"", 'cpfxjg':"", 'mjqsrq':"", 'mjjsrq':"", 'pagenum':'2', 'areacode':"", 'code':"", 'tzzlxdm':"03" }
    if pagenum is not None:
        postDict['pagenum'] = str(pagenum)
    # Same output as the Python 2 statement `print a, b, c`.
    print(' '.join([baseurl, str(headers), str(postDict)]))
    postData = urllib.urlencode(postDict)
    try:
        req = urllib2.Request(baseurl, headers = headers, data = postData)
        response = urllib2.urlopen(req, timeout = timeout)
        try:
            page = response.read()
            contentEncoding = response.info().get('Content-Encoding', '')
        finally:
            # Release the socket on every path.
            response.close()
        if not page:
            return (RSPSTATUS.EMPTY, None)
        # The request advertises gzip/deflate, so the body may be compressed.
        page = _decompressBody(page, contentEncoding)
        if not page:
            return (RSPSTATUS.EMPTY, None)
        jo = json.loads(page.decode('utf-8'))
        try:
            infos = jo['List']
        except Exception as e:
            return (RSPSTATUS.TOJSONERR, str(e))
        # success; infos may be an empty list
        return (RSPSTATUS.SUC, infos)
    except socket.timeout:
        return (RSPSTATUS.TIMEOUT, None)
    except urllib2.URLError as e:
        if isinstance(e.reason, socket.timeout):
            return (RSPSTATUS.TIMEOUT, None)
        return (RSPSTATUS.URLLIBERR, str(e))
    except Exception:
        return (RSPSTATUS.OTHERERR, traceback.format_exc())
def parseCmd():
    """Read the command line and turn the global DEBUG flag on for "-d"."""
    global DEBUG
    parsedOpts, _rest = getopt.getopt(sys.argv[1:], "-d")
    if "-d" in [flag for flag, _value in parsedOpts]:
        DEBUG = True
def F(val):
    """Return *val*, or 0.0 when *val* is None."""
    # Identity test: `== None` can be answered by a custom __eq__.
    return 0.0 if val is None else val
def S(val):
    """Return *val*, or the empty string when *val* is None."""
    # Identity test: `== None` can be answered by a custom __eq__.
    return '' if val is None else val
def I(val):
    """Return *val*, or 0 when *val* is None."""
    # Identity test: `== None` can be answered by a custom __eq__.
    return 0 if val is None else val
def D(val):
    """Normalise a date string to 'YYYY-MM-DD'.

    A 10-character value is read as year, separator, month, separator, day
    (for example '2016/07/28'); any other length is read as 'YYYYMMDD'.
    None and the empty string both yield '' -- previously '' produced the
    bogus date '--'.
    """
    if not val:
        return ''
    if len(val) == 10:
        # Skip the one-character separators after the year and the month.
        monthStart, dayStart = 5, 8
    else:
        monthStart, dayStart = 4, 6
    year = val[0:4]
    month = val[monthStart:monthStart + 2]
    day = val[dayStart:dayStart + 2]
    return '%s-%s-%s' % (year, month, day)
def TypeUnionTransfer(k, v):
    """Coerce value *v* of field *k* to the type named in TypeMapConstant.

    'char' values go through S(); 'double' and 'int' values go through
    float() and int(), with None or '' treated as zero.  Returns None when
    the mapped type is none of those three.  Raises KeyError when *k* is
    not in TypeMapConstant.
    """
    kind = TypeMapConstant[k]
    if kind == 'char':
        return S(v)
    if kind == 'double':
        return float('0.0' if v in (None, '') else v)
    if kind == 'int':
        return int('0' if v in (None, '') else v)
    return None
class dbUtils(object):
    """Namespace for database helper routines."""

    @staticmethod
    def mysqlConnect(mhost, muser, mpasswd, mdb, mport):
        """Open a utf8 MySQL connection and return (connection, cursor).

        On any MySQLdb.Error the error is printed and (None, None) is
        returned instead of raising.
        """
        settings = dict(host = mhost, user = muser, passwd = mpasswd,
                        db = mdb, port = mport, charset = 'utf8')
        try:
            connection = MySQLdb.connect(**settings)
            return (connection, connection.cursor())
        except MySQLdb.Error as e:
            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
            return (None, None)
class lcinfo(object):
    """One row of the `lcinfo` table, built from a scraped product record.

    The constructor stores every field as an attribute of the same name,
    stamps the row with the current date and time (regdate / regtime), and
    keeps all 30 values in column order in ``valueTuple``.

    insert() and update() pass the values to the driver as bound
    parameters.  The values come from a remote web page, so they must not
    be spliced into the SQL text: a single quote would break the statement
    or inject SQL.
    """

    # Positions inside valueTuple that hold numeric columns.
    _FLOAT_POSITIONS = (5, 6, 19, 24, 25, 26)
    _INT_POSITIONS = (17,)

    # Columns rewritten by update(), in the order of valueTuple[2:].
    _UPDATE_COLUMNS = (
        'mjbz', 'tzlxms', 'cpqsrq', 'yjkhzdnsyl', 'yjkhzgnsyl', 'qxms',
        'mjqsrq', 'tzzlxms', 'cpztms', 'cpyjzzrq', 'cpms', 'fxjgms',
        'fxjgdm', 'cpfxdj', 'fxdjms', 'cpqx', 'mjjsrq', 'qdxsje', 'cplx',
        'cplxms', 'cpsylx', 'cpsylxms', 'cpjz', 'dqsjsyl', 'csjz', 'xsqy',
        'regdate', 'regtime',
    )

    def __init__(self,
                 cpid, cpdjbm, mjbz, tzlxms, cpqsrq, yjkhzdnsyl, yjkhzgnsyl, qxms,
                 mjqsrq, tzzlxms, cpztms, cpyjzzrq, cpms, fxjgms, fxjgdm, cpfxdj,
                 fxdjms, cpqx, mjjsrq, qdxsje, cplx, cplxms, cpsylx, cpsylxms, cpjz,
                 dqsjsyl, csjz, xsqy):
        self.cpid = cpid
        self.cpdjbm = cpdjbm
        self.mjbz = mjbz
        self.tzlxms = tzlxms
        self.cpqsrq = cpqsrq
        self.yjkhzdnsyl = yjkhzdnsyl
        self.yjkhzgnsyl = yjkhzgnsyl
        self.qxms = qxms
        self.mjqsrq = mjqsrq
        self.tzzlxms = tzzlxms
        self.cpztms = cpztms
        self.cpyjzzrq = cpyjzzrq
        self.cpms = cpms
        self.fxjgms = fxjgms
        self.fxjgdm = fxjgdm
        self.cpfxdj = cpfxdj
        self.fxdjms = fxdjms
        self.cpqx = cpqx
        self.mjjsrq = mjjsrq
        self.qdxsje = qdxsje
        self.cplx = cplx
        self.cplxms = cplxms
        self.cpsylx = cpsylx
        self.cpsylxms = cpsylxms
        self.cpjz = cpjz
        self.dqsjsyl = dqsjsyl
        self.csjz = csjz
        self.xsqy = xsqy
        self.regdate, self.regtime = getNowDateAndTime()
        # All column values in table order (the auto-filled id is excluded).
        self.valueTuple = (
            self.cpid, self.cpdjbm, self.mjbz, self.tzlxms, self.cpqsrq,
            self.yjkhzdnsyl, self.yjkhzgnsyl, self.qxms, self.mjqsrq,
            self.tzzlxms, self.cpztms, self.cpyjzzrq, self.cpms, self.fxjgms,
            self.fxjgdm, self.cpfxdj, self.fxdjms, self.cpqx, self.mjjsrq,
            self.qdxsje, self.cplx, self.cplxms, self.cpsylx, self.cpsylxms,
            self.cpjz, self.dqsjsyl, self.csjz, self.xsqy, self.regdate,
            self.regtime)

    def _sqlParams(self):
        """Return valueTuple as a list with the numeric columns normalised.

        Float columns are rounded to two decimals and the int column is
        truncated, matching the former '%0.2lf' and '%d' formatting.
        """
        params = list(self.valueTuple)
        for pos in self._FLOAT_POSITIONS:
            params[pos] = round(float(params[pos]), 2)
        for pos in self._INT_POSITIONS:
            params[pos] = int(params[pos])
        return params

    @staticmethod
    def _execute(cur, sqlCmd, params):
        """Run one statement; return the raised exception, or None if ok.

        Nothing is executed (and None is returned) when *cur* is None.
        """
        if cur is None:
            return None
        try:
            cur.execute(sqlCmd, tuple(params))
        except Exception as e:
            return e
        return None

    # RETURN(None:success ; Not None: error)
    def insert(self, cur):
        """Insert this row using cursor *cur*.

        Returns None on success, otherwise the exception that was raised.
        A duplicate-key failure (error code 1062) is returned without being
        logged so the caller can fall back to update().
        """
        params = self._sqlParams()
        # The leading 0 fills the first column; one %s per remaining column.
        sqlCmd = 'INSERT INTO lcinfo values(0, ' + ', '.join(['%s'] * len(params)) + ')'
        error = self._execute(cur, sqlCmd, params)
        if error is not None and getErrnoFromE(error) != 1062:
            logger('insert error' + str(error))
            logger('DBACTION:' + sqlCmd + ' ' + repr(params))
        return error

    # RETURN(None:success ; Not None: error)
    def update(self, cur):
        """Rewrite every column except cpid of the row whose cpdjbm matches.

        Returns None on success, otherwise the exception that was raised
        (which is also logged).
        """
        params = self._sqlParams()
        assignments = ', '.join(['%s = %%s' % col for col in self._UPDATE_COLUMNS])
        sqlCmd = 'UPDATE lcinfo set ' + assignments + ' where cpdjbm = %s'
        # Values for the SET list first, then cpdjbm for the WHERE clause.
        error = self._execute(cur, sqlCmd, params[2:] + params[1:2])
        if error is not None:
            logger('update error' + str(error))
        return error
def _closeDb(conn, cur):
    """Close the cursor and then the connection, skipping any that is None."""
    for handle in (cur, conn):
        if handle is not None:
            handle.close()


# RETURN: None on success; otherwise the failure that stopped the page:
#   the (status, payload) tuple from getPageInfo, or the exception returned
#   by lcinfo.insert / lcinfo.update.
def doOneSpider(pagenum):
    """Fetch page *pagenum* and store each record (insert, or update on duplicate).

    The database connection is now closed on every exit path.  If it could
    not be opened, records are still fetched but nothing is written.
    """
    (conn, cur) = dbUtils.mysqlConnect(mhost = 'localhost', muser = 'root', mpasswd = 'root', mdb = 'lc', mport = 3306)
    dbOpenFlag = conn is not None and cur is not None
    try:
        res = getPageInfo(pagenum)
        if res is None:
            logger("ErrorCode: None ErrorDesc: getPageInfo returned nothing")
            return (RSPSTATUS.OTHERERR, 'getPageInfo returned nothing')
        if res[0] != RSPSTATUS.SUC:
            # getPageInfo error
            logger("ErrorCode: " + str(res[0]) + " ErrorDesc: " + str(res[1]))
            return res

        # NOTE(review): an empty record list counts as success, so the
        # caller keeps requesting further pages -- confirm this is intended.
        for info in res[1]:
            for k, v in list(info.items()):
                if k in TypeMapConstant:
                    info[k] = TypeUnionTransfer(k, v)
            lc = lcinfo(info['cpid'], info['cpdjbm'], info['mjbz'], info['tzlxms'], D(info['cpqsrq']), info['yjkhzdnsyl'], info['yjkhzgnsyl'],
                        info['qxms'], D(info['mjqsrq']), info['tzzlxms'], info['cpztms'], D(info['cpyjzzrq']), info['cpms'], info['fxjgms'],
                        info['fxjgdm'], info['cpfxdj'], info['fxdjms'], info['cpqx'], D(info['mjjsrq']), info['qdxsje'], info['cplx'], info['cplxms'],
                        info['cpsylx'], info['cpsylxms'], info['cpjz'], info['dqsjsyl'], info['csjz'], info['xsqy'])
            error = lc.insert(cur)
            if error is not None and getErrnoFromE(error) == 1062:
                # Duplicate key: the row already exists, so refresh it.
                updateError = lc.update(cur)
                if updateError is not None:
                    logger('Update Error: ' + str(updateError))
                    return updateError
            elif error is not None:
                logger('Insert: ' + lc.cpdjbm + ' error')
                return error
            # Insert or update succeeded.
            if dbOpenFlag:
                conn.commit()
        return None
    finally:
        _closeDb(conn, cur)
def doSpider():
    """Crawl pages 1..9999 in order, retrying transient failures.

    Each page gets up to three attempts; a timeout or urllib error sleeps
    30-60 seconds before the next attempt.  An EMPTY (or -1) status, or any
    other error, ends the whole crawl.  A page whose attempts are used up
    is skipped and the crawl moves on to the next page.
    """
    retryable = (RSPSTATUS.TIMEOUT, RSPSTATUS.URLLIBERR)
    for pagenum in range(1, 10000):
        logger('pagenum start: ' + str(pagenum))
        attemptsLeft = 3
        while attemptsLeft > 0:
            res = doOneSpider(pagenum)
            if res is None:
                # success
                logger('pagenum end: ' + str(pagenum))
                break
            status = res[0]
            if status == -1 or status == RSPSTATUS.EMPTY:
                logger('pagenum stop: ' + str(pagenum))
                return
            if status in retryable:
                logger('timeout let me sleep')
                time.sleep(random.randint(30, 60))
                attemptsLeft -= 1
                continue
            logger('pagenum: ' + str(pagenum) + ' error: ' + str(res))
            return
def spider():
    """Run forever: crawl during working hours, sleep otherwise.

    Between 00:00:00 and 12:00:00, and after 23:00:00 (bounds excluded),
    the loop only sleeps five minutes and checks again.  Otherwise it runs
    one full crawl and then sleeps fifteen minutes.
    """
    while True:
        (_day, t) = getNowDateAndTime()
        inMorning = '00:00:00' < t < '12:00:00'
        inLateNight = '23:00:00' < t < '24:00:00'
        if inMorning or inLateNight:
            logger('Not in worker time, Sleep!!')
            time.sleep(5 * 60)
            continue
        logger(' ---- <<<<<<<<<<<<<<<< spider start >>>>>>>>>>>>>> ---- ')
        doSpider()
        # 15 min
        logger(' --- sleep 15 minutes --- ')
        time.sleep(15 * 60)
        logger(' ---- >>>>>>>>>>>>>>> spider end <<<<<<<<<<<<<<<<< ---- ')
def testDB1():
    """Manual smoke test: insert, then update, one hard-coded record.

    Returns right after logging when the database cannot be opened
    (previously it went on and crashed on conn.commit()), and always
    closes the cursor and connection.
    """
    lc = lcinfo('1079256', 'C1080316000941', '人民币(CNY)', '债券类', '2016-08-02', 4.3, 4.3, '1-3个月(含)', '2016/07/28',
                '一般个人客户', '在售', '2016/09/19', '2016汇富计划698期', '天津银行股份有限公司', 'C10803', '02', '(中低)',
                48, '2016/08/01', 50000, '02', '值型', '03', '收益', 0, 0, 0, '')
    (conn, cur) = dbUtils.mysqlConnect(mhost = 'localhost', muser = 'root', mpasswd = 'root', mdb = 'lc', mport = 3306)
    if conn is None or cur is None:
        logger('db error')
        return
    try:
        lc.insert(cur)
        lc.update(cur)
        conn.commit()
    finally:
        cur.close()
        conn.close()
def _forkAndLeaveParent():
    """Fork once; the parent exits with 0 and only the child returns.

    A failed fork is logged and ends the process with exit status 1.
    """
    try:
        if os.fork() > 0:
            # in parent
            sys.exit(0)
    except OSError:
        logger('fork error')
        sys.exit(1)


def daemonize():
    """Detach from the launching process with two forks.

    Between the forks the file-mode mask is cleared and a new session is
    started.
    """
    _forkAndLeaveParent()
    os.umask(0)
    os.setsid()
    _forkAndLeaveParent()
# Script entry point: runs only when the file is executed directly.
if __name__ == '__main__':
    # Python 2 only: reloading sys brings setdefaultencoding() back so the
    # implicit str/unicode conversions use UTF-8.
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')
    # Read "-d" before anything logs, so DEBUG echoing applies from the start.
    parseCmd()
    # Detaching into the background is currently switched off.
    #daemonize()
    spider()
Python
1
https://gitee.com/hotmocha/chinawealth-spider.git
git@gitee.com:hotmocha/chinawealth-spider.git
hotmocha
chinawealth-spider
chinawealth-spider
master

Search