#!/usr/bin/env python
#coding=utf-8
 
import urllib2,re
from uliweb.orm import *
 
# Captures the href value of an anchor tag, whether it is double-quoted,
# single-quoted or unquoted.  The lookahead keeps tags such as <abbr> or
# <area> out, and requiring whitespace before "href" skips "data-href".
_HREF_RE = re.compile(
    r"""<a(?=\s)[^>]*?\shref\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))""",
    re.IGNORECASE)

# A link is treated as absolute when it starts with a URL scheme ("http:").
_SCHEME_RE = re.compile(r"[A-Za-z][A-Za-z0-9+.-]*:")


def _site_root(page_url):
    """Return ``(matched_text, root)`` for *page_url*, or None without a host.

    ``matched_text`` is the "://host/" fragment found in the URL and ``root``
    is "<scheme>://<host>/", keeping the page's own scheme and any port.
    """
    match = re.search(r"://.[^/]+/", page_url)
    if match is None:
        # No slash after the host, e.g. "http://example.com".
        match = re.search(r"://.+", page_url)
    if match is None:
        return None

    # Cut the host at the first path/query/fragment delimiter only; the ":"
    # of a port number must survive.
    host = re.split(r"[/?#]", match.group()[3:], 1)[0].strip()
    if not host:
        return None

    scheme = page_url[:match.start()].strip()
    if not re.match(r"[A-Za-z][A-Za-z0-9+.-]*$", scheme):
        scheme = "http"
    return match.group(), "%s://%s/" % (scheme, host)


def geturl(url, timeout=5000):
    """Download the page at *url* and return the links found in its <a> tags.

    Args:
        url: Address of the page to fetch; it must contain "://".
        timeout: Value passed to ``urllib2.urlopen`` as its ``timeout``
            argument, which that function interprets as seconds.
            NOTE(review): the historical default of 5000 is kept for
            compatibility, but 5 seconds was probably intended -- confirm.

    Returns:
        A list of link strings: first every absolute link (one that starts
        with a scheme, or a protocol-relative "//host/..." link completed
        with the page's scheme), then every relative link prefixed with the
        site root "<scheme>://<host>/".  Links without a "/" (which includes
        a bare "#") are skipped.  An empty list is returned when *url* is
        empty or no host can be found in it.

    Raises:
        Whatever ``urllib2.urlopen`` or ``read`` raise when the fetch fails.

    NOTE: relative links are joined to the site root rather than to the
    directory of the fetched page.
    """
    if not url:
        return []
    site = _site_root(url)
    if site is None:
        return []
    matched, root = site
    print(matched)

    # Work on UTF-8 byte strings from here on so that joining the root with
    # hrefs taken from the raw page bytes never triggers an implicit ASCII
    # decode.
    page_url = url.encode("utf-8")
    root = root.encode("utf-8")
    scheme = root.split(":", 1)[0]

    response = urllib2.urlopen(page_url, timeout=timeout)
    try:
        html = response.read()
    finally:
        response.close()

    absolute = []
    relative = []
    for found in _HREF_RE.finditer(html):
        link = found.group(1) or found.group(2) or found.group(3) or ""
        link = link.strip()
        # Requiring a "/" drops "#", in-page anchors and "javascript:;".
        if "/" not in link:
            continue
        if link.startswith("//"):
            # Protocol-relative link: reuse the scheme of the fetched page.
            absolute.append(scheme + ":" + link)
        elif _SCHEME_RE.match(link):
            absolute.append(link)
        else:
            # The root already ends with "/", so avoid doubling it.
            relative.append(root + link.lstrip("/"))

    return absolute + relative
 
# Module-level ORM connection: MySQL database "spider" on localhost, user
# "root", utf8 charset (all taken from the connection string below).
# get_connection is presumably supplied by the `uliweb.orm` star import.
# NOTE(review): the credentials are hard-coded; consider reading them from
# configuration instead.
db = get_connection('mysql://root:root@localhost/spider?charset=utf8')
 
# ORM model holding one row per page address known to the crawler.
# Rows are created by insert_url() and marked as handled by update_url().
class urls(Model):
    # Page address; insert_url() refuses to store the same value twice.
    url = Field(str)
    # State flag kept as a string: insert_url() writes "0", update_url()
    # writes "1", and get_url() selects rows that still hold "0".
    status = Field(str)
 
def search_url(url):
    """Look up the ``urls`` record whose ``url`` column equals *url*.

    The result of ``urls.get`` is returned unchanged; insert_url() treats a
    falsy result as "not stored yet".
    """
    return urls.get(urls.c.url == url)
 
def insert_url(url):
    """Store *url* as a new ``urls`` record with status "0".

    Nothing is written when search_url() already finds a record for it.
    """
    if search_url(url):
        return
    record = urls()
    record.url = url
    record.status = "0"
    record.save()
 
def get_url():
    """Return what ``urls.get`` yields for records whose status is "0"."""
    pending = urls.c.status == "0"
    return urls.get(pending)
 
def update_url(n):
    """Reload the record that has the same id as *n* and save status "1"."""
    record = urls.get(urls.c.id == n.id)
    record.update(status="1")
    record.save()
 
def save_newurl(url):
    """Pass every entry of the iterable *url* to insert_url().

    One "add ... OK!" line is printed per entry, whether or not
    insert_url() actually stored a new record for it.
    """
    for link in url:
        insert_url(link)
        print("add %s OK!" % (link))
#db.metadata.drop_all()
#db.metadata.create_all()
#n = urls()
#n.url = "http://v.hpcasts.com/"
 
#n.status = "0"
#n.save()
 
while 1:
    new = get_url()
    try:
        url = geturl(new.url)