#!/usr/bin/env python #coding=utf-8 import urllib2,re from uliweb.orm import * def geturl(url): # h4 = u'http://www.baidu.com/' h4 = url url = re.search(r"://.[^/]+/",h4) if url == None: url = re.search(r"://.+",h4) print url.group() yuming = re.sub("""[:/ ]""",'',url.group()) #print yuming if yuming == None: return None yuming = "http://" + yuming + "/" h4=h4.encode("utf-8") f = urllib2.urlopen(h4,timeout=5000) buf = f.read() #print buf urls = re.findall(r"<[aA].*?href.*?>",buf) list_jue = [] list_xiang = [] for n in urls: # print n url = re.search(r"=.*?[ >]",n) #print url.group() url_box = re.sub("""[= '">]""",'',url.group()) #print url_box if url_box == '#': continue if '/' not in url_box: continue if ':' not in url_box: #l1 = yuming + '/' + url_box continue #print l1 list_jue.append(url_box) #print list_jue #print url_box for i in urls: url = re.search(r"=.*?[ >]",i) url_box1 = re.sub("""[= '">]""",'',url.group()) if 'http' in url_box1: continue if url_box1 == '#': continue if '/' not in url_box1: continue l1 = yuming + url_box1 list_xiang.append(l1) data = list_jue+list_xiang return data db = get_connection('mysql://root:root@localhost/spider?charset=utf8') class urls(Model): url = Field(str) status = Field(str) def search_url(url): n = urls.get(urls.c.url == url) return n def insert_url(url): u = search_url(url) if u: return n = urls() n.url = url n.status = "0" n.save() def get_url(): n = urls.get(urls.c.status == "0") return n def update_url(n): n = urls.get(urls.c.id == n.id) n.update(status="1") n.save() def save_newurl(url): for u in url: insert_url(u) print "add %s OK!" %(u) #db.metadata.drop_all() #db.metadata.create_all() #n = urls() #n.url = "http://v.hpcasts.com/" #n.status = "0" #n.save() while 1: new = get_url() try: url = geturl(new.url)