backup:alexa query

#!/usr/bin/env python

import sys
import urllib
import re
import Queue
import threading
import time
import socket

# Captures the text that follows the "Global" globe icon in the fetched page
# (presumably the site's global rank -- confirm against the live markup).
RANK = re.compile(r'<img src="/images/icons/globe-sm.jpg" alt="Global" style="margin-bottom:-2px;"/>([^<]+)</div>')
# Captures the text that follows the "China Flag" icon in the fetched page
# (presumably the site's rank within China -- confirm against the live markup).
CHINA = re.compile(r'<img class="dynamic-icon"[^>]+"China Flag"/>([^<]+)</div>')
# Matches a leading "www." so it can be stripped from a domain before lookup.
WWW = re.compile(r'^www\.')
def _first_match(pattern, html):
    """Return the first capture of *pattern* in *html* as a CSV-safe field.

    The captured text is stripped of surrounding whitespace and has its commas
    removed so it cannot break the comma-separated output line.  When the
    pattern does not occur in the page, '-' is returned as a placeholder.
    """
    found = pattern.search(html)
    if found is None:
        return '-'
    return found.group(1).strip().replace(',', '')


def get_alexa(domain):
    """Fetch the Alexa siteinfo page for *domain* and print one CSV line.

    A leading "www." is removed before building the request URL, but the
    domain is echoed in the output exactly as it was passed in.  The line
    written to stdout has the form ``<domain>,<RANK field>,<CHINA field>``
    followed by a newline; a field is '-' when its pattern is not found.

    Errors raised while opening or reading the URL propagate to the caller.
    """
    d = WWW.sub('', domain)
    response = urllib.urlopen('http://www.alexa.com/siteinfo/' + d)
    try:
        html = response.read()
    finally:
        # Release the connection explicitly instead of waiting for the
        # response object to be garbage collected.
        response.close()
    fields = [domain, _first_match(RANK, html), _first_match(CHINA, html)]
    # Emit the whole line with a single write() call, as the original did,
    # since several worker threads share stdout.
    sys.stdout.write(','.join(fields) + '\n')


class Alexa(threading.Thread):
    """Worker thread that looks up every domain it takes from a shared queue.

    The thread loops forever; it is expected to be started as a daemon so
    that the process can exit once the queue has been fully processed.
    """

    def __init__(self, queue):
        """Remember the queue of domain names this worker consumes."""
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        """Process domains until the process exits.

        A domain whose lookup raises socket.error is put back on the queue
        after a two second pause so that it is retried later.  Any other
        failure is reported on stderr and the domain is dropped.
        """
        while True:
            domain = self.queue.get()
            try:
                get_alexa(domain)
            except socket.error:
                time.sleep(2)
                # Re-queue before task_done() runs in the finally clause, so
                # the queue's unfinished-task count never reaches zero while
                # this domain is still pending.
                self.queue.put(domain)
            except Exception as exc:
                # Best effort: skip the domain, but leave a trace instead of
                # discarding the failure silently.
                sys.stderr.write(
                    'alexa lookup failed for %s: %r\n' % (domain, exc))
            finally:
                # Exactly one task_done() per get(), whatever happened above.
                self.queue.task_done()

# Shared work queue of domain names, consumed by a pool of 20 daemon workers.
queue = Queue.Queue()
for i in xrange(20):
    t = Alexa(queue)
    # Daemon threads do not keep the process alive once the main thread ends.
    t.daemon = True
    t.start()

# Feed one domain per line from the input file; the file is closed as soon as
# every line has been queued.
with open('./input.txt') as domains:
    for line in domains:
        domain = line.strip()
        # Skip blank lines rather than issuing a lookup for an empty domain.
        if domain:
            queue.put(domain)
# Block until every queued domain has been marked done by a worker.
queue.join()

加入讨论

电子邮件地址不会被公开。 必填项已用*标注

此站点使用Akismet来减少垃圾评论。了解我们如何处理您的评论数据