#!/usr/bin/env python # -*- coding:utf8 -*- # @Author : c32 (amd5@qq.com) # @Blog : http://www.19aq.com/ # @Version : # @DateTime: 2018-4-14 15:50:41 import urllib import urllib2 import MySQLdb import re # import sys conn = MySQLdb.connect(host='c32.19aq.com', user='www_yese_com', passwd='www_yese_com', db='www_yese_com', port = 3306, charset = 'utf8') headers = {'cookie':'YeSeEden=login=OK&lasttime=122%2E190%2E94%2E123&icount%5Ftime=03%2D12+23%3A32&Avatar=0&Recommended=Null&area%5Fb=%CC%EC%BD%F2%CA%D0&cjx=&area%5Fs=%BA%CD%C6%BD%C7%F8&flag=1&truemember=0&grade=1&email=&id=2496224&sex=%AEI&password=&username=pdwl; expires=Tue, 13-Mar-2018 03:32:48 GMT; path=/'} # session = requests.session() #基础信息 def main(): url = "https://www.yesejiaoyou.com/sub/Check_login.asp" data = { 'Key_Username':'c3253220', 'Key_Password':'3253220' } return post(url, data) #登陆网站 def post(url, data): req = urllib2.Request(url) data = urllib.urlencode(data) #enable cookie opener = urllib2.build_opener(urllib2.HTTPCookieProcessor()) response = opener.open(req, data) print response.read() #打印登陆状态 1 正常 return personal() #return response.read() #返回登陆状态 #进入个人中心 def personal(): link = 'https://www.yesejiaoyou.com/personal/' url = 'https://www.yesejiaoyou.com/member/' # opener = urllib2.build_opener() # opener.addheaders.append(('Cookie', cookie)) # f = opener.open(url) req = urllib2.Request(link, headers=headers) r = urllib2.urlopen(req) html = r.read() # return html return getList(url) #获取页数 def getList(url): req = urllib2.Request(url, headers=headers) r = urllib2.urlopen(req) html = r.read() print '---------getList--------' print findPage(html) # print html # return html def findPage(html): items = re.findall('gt; >> ', html, re.S) # return myItems # print items for item in items: str = item.split('=') page = str[1] #取出页数 print '--------findPage---------' # print page print getUrl(page) def findList(html): #正则匹配列表 myItems = re.findall('
[\w\W]*(.*?)[\w\W]*(.*?)[\w\W]*
[\w\W]*
[\w\W]*(.*?)', html, re.S) print '--------findList----000-----' # print html print '--------findList----111-----' # print myItems print '--------findList----222-----' return myItems def urlPages(pages): url = 'https://www.yesejiaoyou.com/member/?page=' + str(pages) print '--------urlPages---------' return url def getListurl(url): req = urllib2.Request(url, headers=headers) data = urllib2.urlopen(req) html = data.read() # data.close() print '--------getListurl---------' return html def getUrl(page): for pages in range(1, int(page)+1): html = getListurl(urlPages(pages)) print '--------getUrl----000-----' # print html items = findList(html) print '--------getUrl----111-----' for item in items: # print item # print item[0] a = item[1].decode('gb2312','ignore').encode('utf-8') b = item[3].decode('gb2312','ignore').encode('utf-8') # print items[0] print '--------getUrl----222----' cur = conn.cursor() cur.execute('INSERT INTO list (url, name, shuxing) VALUES (%s, %s, %s)',(item[0],a,b)) conn.commit() # print pages # items = findList(html) if __name__ == '__main__': main() # personal() # page() conn.close() # def getHtml(url): #获取html源码 # page = urllib2.urlopen(url) # html = page.read() # return html # html = getHtml(url) # print html # test = logina # print logina(url,userpass,headers) # req = urllib2.Request(link) # page = urllib2.urlopen(req) # html = page.read() # print '========================' # print '你好' # print html # page.close()