Python 實現的多線程代理驗證程序

網上找到的 exe 版代理驗證程序多數被報毒,不少還掛了馬,還是 Python 安全。我在網上一個 Python 驗證程序的基礎上做了修改,在此記錄一下,以備後用。

基本思路是通過正則表達式解析網上公佈的匿名代理,然後啓用多線程驗證代理可用性,並形成列表。

驗證有兩種方式,一種是直接使用代理去連接相應的網頁(暫未驗證urllib2的proxy handler在多線程下的可行性),另外一種是直接使用socket去connect代理服務器,默認使用的是第一種。

代碼如下:

# coding:utf-8
# Nova

import functools
import re
import socket
import sys
import threading
import time
import timeit
import urllib2


# Number of worker threads started to verify proxies.
THREAD_COUNT  = 50
# Connection timeout for the proxy checks, presumably in seconds.
CONN_TIME_OUT = 5
# When True proxies are checked with verify_by_socket, otherwise with
# verify_by_http.
VERIFY_BY_SOCKET = False


# Pages to scrape for proxies. Each entry has:
#   'url'     - page that fetch_proxies downloads
#   'pattern' - regex applied with findall; its three groups are read as
#               ip, port and type
#   'foreign' - flag copied onto every proxy parsed from the page;
#               verify_by_http uses it to pick the test site
# NOTE(review): both patterns look as if literal HTML markup was stripped
# from them (long runs of bare \W*), and the first one contains `\W**`,
# which re.compile is expected to reject as a "multiple repeat". Confirm
# against the live pages before relying on them.
proxy_sources = [
  {
    'url': 'http://www.xici.net.co/wn/', 
    'pattern': r'\W*\W*\W*\W*([\.\d]+)\W*(\d+)\W*\W*\W**\W*\W*(.*?)',
    'foreign': True
  },
  {
    'url':  'http://www.xici.net.co/nn/',
    'pattern': r'\W*\W*\W*\W*(.*)\W*(.*)\W*\W*.*\W\W*\W*.*\W*\W*(.*)\W*',
    'foreign': False
  }
]

# Proxies scraped from the source pages; get_proxy pops entries off this
# list until it is empty.
proxies = []

# Proxies that passed verification; failed ones are not recorded.
result =[]

# Single lock shared by every function wrapped with @synchronized.
lock = threading.Lock()

def synchronized(fun):
  """Decorate ``fun`` so every call runs while holding the module-level ``lock``.

  All decorated functions share that one lock, so at most one of them
  executes at any moment. The lock is a plain ``threading.Lock`` (not
  re-entrant): a decorated function must not call another decorated
  function, or it will block forever.

  Returns a wrapper that forwards all positional and keyword arguments to
  ``fun`` and hands back its return value; exceptions raised by ``fun``
  propagate unchanged.
  """
  @functools.wraps(fun)  # keep fun's __name__ / __doc__ on the wrapper
  def call(*args, **kwargs):
    # The ``with`` block releases the lock even when fun raises.
    with lock:
      return fun(*args, **kwargs)
  return call


# synchronized get proxy from proxies
@synchronized
def get_proxy():
    global proxies

    print '%s%s%s%3d' % ('\b'*12, 'remain:    ', '\b'*3, len(proxies)),   # 控制格式化打印輸出
    if len(proxies)>0:
        return proxies.pop()
    else:
        return None

    
# Lock-protected: record a proxy that passed verification.
@synchronized
def add_result(proxy):
    """Append ``proxy`` to ``result`` unless an equal entry is already there."""
    if proxy in result:
        return
    result.append(proxy)

def fetch_proxies(src):
  print 'fetching proxy from %s ...' % src['url'],

  req = urllib2.Request(src['url'])
  rsp = urllib2.urlopen(req)
  rs = re.compile(src['pattern']).findall(rsp.read())

  i = 0
  for r in rs:
    proxy = {}
    proxy['ip'] = r[0]    # proxy ip
    proxy['port'] = r[1]  # proxy port
    proxy['type'] = r[2]  # proxy type: HTTP, HTTPS, SOCK5
    proxy['foreign'] = src['foreign']
    proxy['time'] = 0     # connect speed, refresh then

    if proxy not in proxies:
      proxies.append(proxy)
      i = i+1

  print '%d proxies parsed.' % i


def verify_by_socket(proxy):
  """Check a proxy by opening a plain TCP connection to it.

  proxy is a dict providing at least 'ip' and 'port' (the port may be a
  string; it is converted with int()).

  On success proxy['time'] is set to the connect duration in whole
  milliseconds and the same dict is returned. On any failure (refused,
  timed out, unresolvable host, malformed port, ...) None is returned.
  KeyboardInterrupt and SystemExit are not swallowed.
  """
  sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
  sock.settimeout(CONN_TIME_OUT)

  try:
    # timeit.default_timer is a wall clock on every platform, whereas
    # time.clock() counts CPU time on Unix and so stays near zero while
    # connect() is blocked waiting on the network.
    start = timeit.default_timer()

    # Connect to the proxy server.
    sock.connect((proxy['ip'], int(proxy['port'])))
    proxy['time'] = int((timeit.default_timer() - start) * 1000)
  except Exception:
    # Best effort: whatever went wrong, the proxy counts as unusable.
    return None
  finally:
    sock.close()

  return proxy


def verify_by_http(proxy):
  """Check a proxy by fetching a well-known page through it.

  proxy is a dict providing 'ip', 'port' and 'foreign'. When 'foreign' is
  True the test page is http://www.google.com, otherwise
  http://www.baidu.com.

  On success proxy['time'] is set to the duration of the whole request
  (open + read) in whole milliseconds and the same dict is returned, so
  results can be ranked by speed. On any failure None is returned.
  KeyboardInterrupt and SystemExit are not swallowed.
  """
  if proxy['foreign'] is True:
    url = 'http://www.google.com'
  else:
    url = 'http://www.baidu.com'

  req = urllib2.Request(url)
  req.set_proxy('%(ip)s:%(port)s' % proxy, 'http')

  try:
    # Wall-clock timer; time.clock() would measure CPU time on Unix.
    start = timeit.default_timer()
    conn = urllib2.urlopen(req, timeout=CONN_TIME_OUT)
    try:
      conn.read()
    finally:
      # Close the connection even when read() fails.
      conn.close()
    proxy['time'] = int((timeit.default_timer() - start) * 1000)
  except Exception:
    # Best effort: whatever went wrong, the proxy counts as unusable.
    return None

  return proxy


def verify_proxies():
  """Worker loop: verify queued proxies until get_proxy() returns None.

  Each proxy is checked with verify_by_socket when VERIFY_BY_SOCKET is
  true, otherwise with verify_by_http; proxies whose check does not
  return None are passed to add_result.
  """
  # iter() with a sentinel keeps calling get_proxy until it yields None.
  for proxy in iter(get_proxy, None):
    check = verify_by_socket if VERIFY_BY_SOCKET else verify_by_http
    if check(proxy) is not None:
      add_result(proxy)

def serialize_result():
  """Render ``result`` as text, one 'ip:port type foreign time' line per proxy.

  Every line ends with a newline; an empty ``result`` gives ''.
  """
  line_format = '%(ip)s:%(port)s %(type)s %(foreign)s %(time)d\n'
  return ''.join([line_format % entry for entry in result])

def save_result(fname, str_rst):
  f = open(fname, 'w')
  f.write(str_rst)
  f.close()
  print 'result saved to %s' % fname


if __name__ == '__main__':
  print 'start verifying proxy, press Ctrl+Break to break.'
  for src in proxy_sources:
    fetch_proxies(src)

  # init thread_pool 
  thread_pool = []

  for i in range(THREAD_COUNT): 
    th = threading.Thread(target=verify_proxies, args=()) ; 
    thread_pool.append(th)

  # start threads one by one         
  for thread in thread_pool: 
    thread.start()

  # collect all threads 
  for thread in thread_pool: 
    threading.Thread.join(thread)

  result.sort(lambda x, y: cmp(x['time'], y['time']))

  print ''
  str_rst = serialize_result()
  print ' -------- result -------- '
  print str_rst

  if len(sys.argv) > 1:
    save_result(sys.argv[1], str_rst)



發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章