Downloading Baidu Street View Imagery for a Specific Area

This post is a reorganization of 康博's original blog post.

01 Downloading the Road Network Data

Most workflows use OpenStreetMap (OSM) road network data. There are many ways to download OSM data, so I will not go into them here.

I downloaded the data for all of China from OpenStreetMap Data Extracts and then clipped it with the boundary of my study area, using the ArcGIS Clip tool (ArcToolbox - Analysis Tools - Extract - Clip).
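
As an aside (not part of the original workflow), if you only need a drivable road network clipped to one area, the osmnx package can fetch it in a few lines. This is just a sketch, assuming osmnx (≥ 1.0) is installed; the place name is a placeholder to replace with your study area.

import osmnx as ox

# fetch the drivable road network for a named area (placeholder place name)
G = ox.graph_from_place("Xuhui District, Shanghai, China", network_type="drive")

# export the nodes and edges to a GeoPackage that ArcGIS can open
ox.save_graph_geopackage(G, filepath="../maps/roads.gpkg")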

02 Sampling Points Along the Roads

Sample the points with the Create Points on Polylines with ArcPy toolbox developed by Ian Broad. Download the tbx file from the toolbox's page and copy it into the following folder:

C:\Users\Ivy\AppData\Roaming\ESRI\Desktop10.5\ArcToolbox\My Toolboxes

The newly downloaded toolbox will then appear in Catalog (adjust the path above for your own machine, especially the user name!).

Run the CreatePointsLines tool and set its parameters: choose "INTERVAL BY DISTANCE" as the type and enter 0.00001 in the Distance field. That works out to roughly one point per metre along each road.
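
As a quick sanity check on that spacing: one degree of latitude is roughly 111 km, so 0.00001° is on the order of one metre (the east-west spacing shrinks with latitude). A small script to verify, with the latitude as an example value:

import math

deg = 0.00001
meters_per_deg_lat = 111320          # approx. metres per degree of latitude
lat = 31.0                           # example latitude, replace with your study area

print(deg * meters_per_deg_lat)                                  # ~1.1 m north-south
print(deg * meters_per_deg_lat * math.cos(math.radians(lat)))    # ~0.95 m east-west at 31°N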

Then create longitude and latitude fields in the attribute table and run Calculate Geometry on them to compute the coordinates.

(康博's post converts the coordinate system to WGS84 Web Mercator, but I checked and that Mercator projection does not match the planar Mercator coordinates Baidu uses, so here I export WGS84 coordinates first and convert them to bd09mc later with Baidu's API.)
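
If you would rather script this step than click through Calculate Geometry, a minimal arcpy sketch might look like the following; the shapefile path and field names are placeholders, and it assumes the point layer is already in a WGS84 geographic coordinate system.

import arcpy

fc = r"C:\data\road_points.shp"  # placeholder path to the sampled points

# add lon/lat fields and fill them from the point geometry
arcpy.AddField_management(fc, "lon", "DOUBLE")
arcpy.AddField_management(fc, "lat", "DOUBLE")

with arcpy.da.UpdateCursor(fc, ["SHAPE@X", "SHAPE@Y", "lon", "lat"]) as cursor:
    for x, y, _, _ in cursor:
        cursor.updateRow([x, y, x, y])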

Then export the attribute table as a txt file. (Do not export it as dbf: converting dbf is a hassle, Python needs an extra library for it, and the file is too big for Excel, which has a row limit.)

Next, load the exported points into a database with the code below; I use MongoDB.

import pandas as pd
import pymongo

df = pd.read_csv("../maps/points.txt")

new_df = df[["lon", "lat"]] # 只要座標列
new_df = new_df[(new_df["lon"] != 0) & (new_df["lat"] != 0)] # 刪除計算錯誤的點

new_df["wgs84"] = new_df["lon"].map(str)+","+new_df["lat"].map(str)
new_df["ok"] = 0
del new_df["lon"], new_df["lat"]
print(new_df.shape)
# convert the dataframe to a list of dicts of the form [{"wgs84": "x,y", "ok": 0}, ...]
data = new_df.to_dict(orient = 'records')

# store the points in the database
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["area"]
col = db["rpoints_wgs84"]

col.insert_many(data)

The documents in the database look roughly as shown below. The ok field records processing state: when there are many points, the program may crash or be interrupted unexpectedly, and without this field you would not know where to resume.
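
For reference, each document written by the code above has this shape (the coordinate values are placeholders):

{"_id": ObjectId("..."), "wgs84": "121.468189,31.152661", "ok": 0}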

03 Converting WGS84 to Baidu Mercator Coordinates

Use Baidu's official geoconv API to convert the WGS84 coordinates to Baidu Mercator (bd09mc).

This requires a Baidu developer ak. I created a json file like the one below to store the available aks, so the scripts can load them easily:

{
    "ak": [
        "ak1",
        "ak2"
    ]
}
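
Before the code, this is roughly what a conversion request and its response look like; the response shape is inferred from the fields the code below reads, and the coordinates and ak are placeholders.

http://api.map.baidu.com/geoconv/v1/?coords=lon1,lat1;lon2,lat2&from=1&to=6&ak=YOUR_AK

{"status": 0, "result": [{"x": ..., "y": ...}, {"x": ..., "y": ...}]}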

The conversion code follows. Because the API accepts up to 100 points per request, it uses a helper that splits a list into equal-sized chunks.

import requests, json
import pymongo
import sys, traceback
import random, time
from tqdm import tqdm

def convert_points(point):
    # the API accepts up to 100 points per request
    coords = ";".join(point)
    url = "http://api.map.baidu.com/geoconv/v1/?coords={}&from=1&to=6&ak={}".format(coords, random.choice(aks))
    while True:
        try:
            res = requests.get(url)
            data = res.json()

            if data["status"] == 0:
                return data["result"]
            else:
                print(data)
        except (requests.exceptions.ConnectionError, json.decoder.JSONDecodeError) as e:
            print("\n Error: ", repr(e))
        except:
            print("\n ************************ Alert!! ********************************")
            traceback.print_exc()
            return False

def write_data(mc_point):
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    db = client["area"]
    col = db["rpoints_mc"]

    data = []
    for point in mc_point:
        data.append({"bd09mc": "{},{}".format(point["x"], point["y"]), "ok": 0})

    now = time.time()
    print("Ready to insert data")
    while True:
        try:
            if len(data) <= 1000:
                col.insert_many(data)
            else:
                groups = split_list(data, n=1000)
                for group in groups:
                    col.insert_many(group)
            print("Inserting data used", time.time()-now,"s")
            break
        except:
            traceback.print_exc()
            sys.exit(1)

def split_list(l, n=100):
    # split a list into chunks of n items
    new_l = []
    for i in range(0,len(l),n):
        new_l.append(l[i:i+n])
    return new_l

def get_points():
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    db = client["area"]
    col = db["rpoints_wgs84"]

    docs = col.find({"ok": 0})
    points = [doc["wgs84"] for doc in docs]
    return points

if __name__=="__main__":
    ## load the point coordinates
    wgs_points = get_points()
    points_groups = split_list(wgs_points)
    print("There're", len(points_groups), "groups")

    ## load the available aks
    with open("ak.json", 'r', encoding='utf8') as f:
        j = json.load(f)
    aks = j["ak"]

    valid_points = []
    for wgs_point in tqdm(points_groups, ncols=80):
        while True:
            mc_point = convert_points(wgs_point)
            if mc_point:
                valid_points += mc_point
                break
        
    write_data(valid_points)

04 Getting the Panorama IDs

This step uses an unofficial endpoint to check whether a panorama exists at a given coordinate (different coordinates may return the same panorama):

https://mapsv0.bdimg.com/?qt=qsdata&x={}&y={}

It returns JSON containing the panorama id.
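
The response looks roughly like this (the shape is inferred from the fields the code below reads; when no panorama exists at the point, result.error is 404 instead of 0):

{"result": {"error": 0, ...}, "content": {"id": "<panoid>", ...}}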

The code below crawls panorama ids with multiple threads and stores them in the database, again MongoDB.

import pymongo
import threading
from queue import Queue
from threading import Thread
import requests, urllib3
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import sys, traceback
from fake_useragent import UserAgent # for generating fake User-Agent headers
from ippool.ippool_fast import * # my own proxy pool package
import time
import random

class Spider():
    def __init__(self):
        self.thread_num = 10
        self.start = time.time()
        self.all_panoids = self.get_panoids()
        
        # queues
        self.point_q = self.get_points() # points waiting to be checked
        self.all_lenth = self.point_q.qsize() # total queue length
        self.finish_q = Queue(100000) # points already checked
        self.panoid_q = Queue() # panorama id queue
        self.ippool_q = Queue() # proxy pool queue

    def get_points(self):
        client = pymongo.MongoClient("mongodb://localhost:27017/")
        db = client["area"]
        col = db["rpoints_mc"]

        docs = col.find({"ok": 0})

        point_q = Queue()
        for doc in docs:
            point_q.put(doc["bd09mc"])
        return point_q
    
    def get_panoids(self):
        client = pymongo.MongoClient("mongodb://localhost:27017/")
        db = client["area"]
        col = db["streetview"]

        # materialise the cursor first: printing len(list(docs)) would otherwise
        # exhaust it before the list comprehension runs
        docs = list(col.find())
        if docs:
            print(len(docs))
            panoids = [doc["panoid"] for doc in docs]
            return panoids
        return []
    
    def writeData(self, panoids):
        client = pymongo.MongoClient("mongodb://localhost:27017/")
        db = client["area"]
        col = db["streetview"]

        docs = []
        for panoid in panoids:
            docs.append({"panoid": panoid,
                        "bd09mc": "",
                        "bd09ll": "",
                        "wgs84": "",
                        "date": "",
                        "ok": 0,
                        "info": {}})
        col.insert_many(docs)
    
    def req(self, url):
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.9",
            "DNT": "1",
            "Host": "mapsv0.bdimg.com",
            "Connection": "close"
        }

        # keep requesting the url until a valid response comes back
        while True:
            time.sleep(random.random()*2)
            try:
                # pick a random fake User-Agent
                headers["User-Agent"] = UserAgent(path="fake_useragent.json").random
                # get a proxy from the pool
                while True:
                    if self.ippool_q.empty() is False:
                        proxy_ip = self.ippool_q.get()
                        break
                    else:
                        time.sleep(5)

                with requests.Session() as s:
                    retry = Retry(connect=3, backoff_factor=0.5)
                    adapter = HTTPAdapter(max_retries=retry)
                    s.mount("https://", adapter)
                    s.keep_alive = False
                    res = s.get(url, headers=headers, proxies=proxy_ip, timeout=10)
                data = res.json()

                # put the working proxy back into the pool
                self.ippool_q.put(proxy_ip)

                if data["result"]["error"] == 0:
                    return data["content"]["id"]
                elif data["result"] == {"action":0, "error": 404}:
                    return False
                else:
                    print(data)
                    print(url)
                    sys.exit(1)
            except (requests.exceptions.ConnectTimeout, urllib3.exceptions.ReadTimeoutError, requests.exceptions.ProxyError, urllib3.exceptions.MaxRetryError, requests.exceptions.ReadTimeout) as e:
                # print("************************ Alert!! ********************************")
                # print("Error:", repr(e))
                pass
            except:
                print("************************ Alert!! ********************************")
                traceback.print_exc()
                print("url: ", url)

    # thread 1: request the urls and collect panorama ids
    def producer(self):
        print("producer thread started.")
        while self.point_q.empty() is False:
            point = self.point_q.get()
            point_ls = point.split(",")
            url = "https://mapsv0.bdimg.com/?qt=qsdata&x={}&y={}".format(point_ls[0], point_ls[1])
            panoid = self.req(url)
            if (panoid) and (panoid not in self.all_panoids):
                self.panoid_q.put(panoid)
            
            while True:
                if self.finish_q.full():
                    print("finish_q is full")
                    time.sleep(10)
                else:
                    self.finish_q.put(point)
                    break
            print("Processing: {}/{}, panoid got: {}, time elapse: {:.0f}s, points waiting for update: {}".format(self.all_lenth-self.point_q.qsize(), self.all_lenth, len(self.all_panoids), time.time()-self.start, self.finish_q.qsize()))
    
    # thread 2: write the results from thread 1 to the database
    def consumer(self):
        print("consumer thread started.")
        while (self.point_q.empty() is False) or (self.panoid_q.empty() is False):
            if (self.point_q.empty() is True):
                lenth = self.panoid_q.qsize()
                tmp_panoids = []
                for _ in range(lenth):
                    tmp_panoids.append(self.panoid_q.get())
                valid_panoids = list(set(tmp_panoids) - set(self.all_panoids))
                if len(valid_panoids) > 0:
                    self.all_panoids += valid_panoids
                    self.writeData(valid_panoids)
            else:
                pass
    
    # thread 3: mark crawled points in the database
    def update(self):
        print("update thread started.")
        client = pymongo.MongoClient("mongodb://localhost:27017/")
        db = client["area"]
        col = db["rpoints_mc"]

        while (self.point_q.empty() is False) or (self.finish_q.empty() is False):
            if self.finish_q.qsize() >= 100:
                tmp_points = []
                for _ in range(100):
                    tmp_points.append(self.finish_q.get())

                result = col.update_many(
                    {"bd09mc": {"$in": tmp_points}},
                    {"$set": {"ok": 1}}
                )

                print("update result: ", result.matched_count, result.modified_count, result.raw_result)
            elif self.point_q.empty() is True:
                lenth = self.finish_q.qsize()
                tmp_points = []
                for _ in range(lenth):
                    tmp_points.append(self.finish_q.get())

                result = col.update_many(
                    {"bd09mc": {"$in": tmp_points}},
                    {"$set": {"ok": 1}}
                )

                print("update result: ", result.matched_count, result.modified_count, result.raw_result)
            else:
                pass
        print("All data updated")
    
    # thread 4: fetch proxies
    def get_ips(self):
        self.ippool_q.put({"https": ""})
        print("ippool thread started. There're {} proxies.".format(self.ippool_q.qsize()))
        while (self.point_q.empty() is False) or (self.finish_q.empty() is False):
            if self.ippool_q.empty() is True:
                while True:
                    tmp_ippool = buildippool()
                    if len(tmp_ippool) > 5:
                        for ip in tmp_ippool:
                            self.ippool_q.put(ip)
                        break

    def run(self):
        ths =[]

        ippool_thread = Thread(target=self.get_ips)
        ippool_thread.start()
        ths.append(ippool_thread)

        for _ in range(self.thread_num):
            producer_thread = Thread(target=self.producer)
            producer_thread.start()
            ths.append(producer_thread)
        
        consumer_thread = Thread(target=self.consumer)
        consumer_thread.start()
        ths.append(consumer_thread)
        
        update_thread = Thread(target=self.update)
        update_thread.start()
        ths.append(update_thread)

        # block the main thread until all workers finish
        for th in ths:
            th.join()

        print("Time consume:", time.time()-self.start)  

if __name__ == '__main__':
    Spider().run()

The resulting documents look roughly like this:
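
Each document created by writeData above has the following shape; the panoid value is a placeholder, and the empty fields are filled in by the later steps.

{"panoid": "<panoid>", "bd09mc": "", "bd09ll": "", "wgs84": "", "date": "", "ok": 0, "info": {}}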

05 Getting the Panorama Metadata

If you only need to download the images themselves, you can skip this step.

This again hits an unofficial API endpoint. Each request can fetch data for 50 panoramas, but it sometimes fails when given too many (the response carries a 400 error), so the code falls back to splitting a group into two batches of 25.
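
The request joins panorama ids with semicolons, and the response shape below is inferred from the fields the code reads (the ids are placeholders):

https://mapsv0.bdimg.com/?qt=sdata&sid=<panoid1>;<panoid2>;...

{"result": {"error": 0, ...}, "content": [{"ID": "<panoid1>", "Date": "...", "X": ..., "Y": ..., ...}, ...]}

The multithreaded crawler below fetches and parses this metadata.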

import pymongo
from queue import Queue
from threading import Thread
import requests, urllib3
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import sys, traceback
from fake_useragent import UserAgent # for generating fake User-Agent headers
from ippool.ippool_fast import *
import time
import random

class Spider():
    def __init__(self):
        self.thread_num = 5
        self.start = time.time()
        
        # queues
        self.panoid_q = self.get_panoid_groups() # groups of panorama ids waiting to be queried
        self.all_lenth = self.panoid_q.qsize() # total queue length
        self.process_q = Queue()
        self.finish_q = Queue() # results waiting to be written
        self.ippool_q = Queue() # proxy pool queue

        # a flag controls thread shutdown:
        # flag=0: all threads run; flag=1: the crawling thread is done (all groups fetched), so the proxy-pool thread can stop too;
        # flag=2: the processing thread is done (all results parsed)
        self.flag = 0

    def split_list(self, l, n=100):
        # split a list into chunks of n items
        new_l = []
        for i in range(0,len(l),n):
            new_l.append(l[i:i+n])
        return new_l
    
    def get_panoid_groups(self):
        client = pymongo.MongoClient("mongodb://localhost:27017/")
        db = client["area"]
        col = db["streetview"]

        docs = col.find({"ok": 0})
        if docs:
            panoids = [doc["panoid"] for doc in docs]
            groups = self.split_list(panoids, 50)
            panoid_q = Queue()
            for group in groups:
                panoid_q.put(group)
            return panoid_q
        return []
    
    def req(self, url):
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.9",
            "DNT": "1",
            "Host": "mapsv0.bdimg.com",
            "Connection": "close"
        }

        # keep requesting the url until a valid response comes back
        while True:
            time.sleep(random.random()*2)
            try:
                # pick a random fake User-Agent
                headers["User-Agent"] = UserAgent(path="fake_useragent.json").random
                # get a proxy from the pool
                while True:
                    if self.ippool_q.empty() is False:
                        proxy_ip = self.ippool_q.get()
                        break
                    else:
                        time.sleep(5)

                # with requests.Session() as s:
                #     retry = Retry(connect=3, backoff_factor=0.5)
                #     adapter = HTTPAdapter(max_retries=retry)
                #     s.mount("https://", adapter)
                #     s.keep_alive = False
                res = requests.get(url, headers=headers, proxies=proxy_ip, timeout=10)
                data = res.json()

                # put the working proxy back into the pool
                self.ippool_q.put(proxy_ip)

                if data["result"]["error"] == 0:
                    return data["content"]
                elif data["result"]["error"] == 400:
                    return False
                else:
                    print(data)
                    print(url)
                    sys.exit(1)
            except (requests.exceptions.ConnectTimeout, urllib3.exceptions.ReadTimeoutError, requests.exceptions.ProxyError, urllib3.exceptions.MaxRetryError, requests.exceptions.ReadTimeout) as e:
                print("************************ Alert!! ********************************")
                print("Error: TimeoutError or proxyError")
            except:
                print("************************ Alert!! ********************************")
                traceback.print_exc()
                print("url: ", url)

    # thread 1: request the urls and collect results
    def producer(self):
        print("producer thread started.")
        while self.panoid_q.empty() is False:
            group = self.panoid_q.get()
            part = 1 # too many panoids sometimes triggers a 400 error, so the group may need to be split in two
            while True:
                if part == 1:
                    url = "https://mapsv0.bdimg.com/?qt=sdata&sid={}".format(";".join(group))
                    data = self.req(url)
                    if data:
                        self.process_q.put(data)
                        break
                    else:
                        print("split to part 2")
                        part = 2
                        continue
                else:
                    url = "https://mapsv0.bdimg.com/?qt=sdata&sid={}".format(";".join(group[:25]))
                    data1 = self.req(url)
                    url = "https://mapsv0.bdimg.com/?qt=sdata&sid={}".format(";".join(group[25:]))
                    data2 = self.req(url)
                    data = data1 + data2
                    self.process_q.put(data)
        self.flag = 1
        print("producer thread done")
    
    # thread 2: process the results returned by thread 1
    def processor(self):
        print("processor thread stared")
        while self.flag < 2:
            if self.process_q.empty() is False:
                data = self.process_q.get()
                for row in data:
                    self.finish_q.put({
                        "id": row["ID"],
                        "info": {
                            "date": row["Date"],
                            "bd09mc": "{},{}".format(row["X"],row["Y"]),
                            "info": row
                        }
                    })

            elif self.flag == 1:
                # at this point thread 1 has finished crawling and this thread has drained its queue
                time.sleep(5) # wait 5 s in case more data is still on the way
                if self.process_q.empty() is True:
                    self.flag = 2
            else:
                pass
        print("processor thread done")
    
    # thread 3: write the parsed results from thread 2 to the database
    def consumer(self):
        print("consumer thread started.")

        client = pymongo.MongoClient("mongodb://localhost:27017/")
        db = client["area"]
        col = db["streetview"]

        while self.flag < 4:
            if self.finish_q.empty() is False:
                data = self.finish_q.get()
                while True:
                    result = col.update_one({
                        "panoid": data["id"]
                    },
                    {
                        "$set": {
                            "bd09mc": data["info"]["bd09mc"],
                            "date": data["info"]["date"],
                            "ok": 1,
                            "info": data["info"]["info"]
                        }
                    })
                    # print("update result: ", result.matched_count, result.modified_count, result.raw_result)
                    if result.modified_count == 1:
                        # print("update success.")
                        break
                    else:
                        print(result.raw_result)
            elif self.flag == 2:
                # thread 2 has finished and the queue is drained: raise the flag so this thread and pbar exit
                self.flag = 4
            else:
                pass
        print("consumer thread done")

    # thread 4: fetch proxies
    def get_ips(self):
        print("ippool thread start")
        self.ippool_q.put({"https": ""})

        while self.flag < 1:
            if self.ippool_q.empty() is True:
                while True:
                    tmp_ippool = buildippool()
                    if len(tmp_ippool) > 5:
                        for ip in tmp_ippool:
                            self.ippool_q.put(ip)
                        break
        print("ippool thread done")
    
    # thread 5: progress display
    def pbar(self):
        state = [0, 0, 0, 0]
        while self.flag < 4:
            if [self.panoid_q.qsize(), self.process_q.qsize(), self.finish_q.qsize()] != state:
                state = [self.panoid_q.qsize(), self.process_q.qsize(), self.finish_q.qsize()]
                print("Processing: {}/{}, time elapse: {:.0f}s, waiting for process: {}, waiting for update: {}".format(self.all_lenth-self.panoid_q.qsize(), self.all_lenth, time.time()-self.start, self.process_q.qsize(), self.finish_q.qsize()))
                time.sleep(1)

    def run(self):
        print("There're {} groups.".format(self.all_lenth))
        ths =[]

        pbar_thread = Thread(target=self.pbar)
        pbar_thread.start()
        ths.append(pbar_thread)

        ippool_thread = Thread(target=self.get_ips)
        ippool_thread.start()
        ths.append(ippool_thread)

        for _ in range(self.thread_num):
            producer_thread = Thread(target=self.producer)
            producer_thread.start()
            ths.append(producer_thread)
        
        processor_thread = Thread(target=self.processor)
        processor_thread.start()
        ths.append(processor_thread)

        consumer_thread = Thread(target=self.consumer)
        consumer_thread.start()
        ths.append(consumer_thread)

        # block the main thread until all workers finish
        for th in ths:
            th.join()

        print("Time consume:", time.time()-self.start, "s")  

if __name__ == '__main__':
    Spider().run()

This yields detailed metadata for each panorama, such as its exact coordinates and capture date.

06 Downloading the Panoramas

This again uses an unofficial API endpoint (the official API allows far too few free downloads per day!):

https://mapsv0.bdimg.com/?qt=pr3d&fovy={}&quality={}&panoid={}&heading={}&pitch={}&width={}&height={}

Some of the parameters are as follows (an example request is shown after the list):

  • fovy: field of view, 0 - 100
  • quality: image quality (resolution), 1 - 100
  • panoid: panorama id
  • heading: horizontal angle, 0 - 360
  • pitch: pitch angle, 1 - 100
  • width: image width, 0 - 1024
  • height: image height, 0 - 512
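
For example, with the parameter values used by the script below, the request for one heading of one panorama looks like this (the panoid is a placeholder):

https://mapsv0.bdimg.com/?qt=pr3d&fovy=75&quality=100&panoid=<panoid>&heading=90&pitch=0&width=1024&height=512

The download code is as follows.
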
import pymongo
import threading
from queue import Queue
from threading import Thread
import requests, urllib3
import sys, traceback
from fake_useragent import UserAgent # for generating fake User-Agent headers
from ippool.ippool_fast import *
import time, os
import random

class Spider():
    def __init__(self):
        self.thread_num = 1
        self.start = time.time()
        
        # queues
        self.panoid_q = self.get_panoids() # panorama ids waiting to be downloaded
        self.all_lenth = self.panoid_q.qsize() # total queue length
        self.finish_q = Queue() # panoramas already downloaded
        self.ippool_q = Queue() # proxy pool queue

    def get_panoids(self):
        client = pymongo.MongoClient("mongodb://localhost:27017/")
        db = client["area"]
        col = db["streetview"]

        docs = col.find({"ok": 1})
        if docs:
            panoid_q = Queue()
            for doc in docs:
                panoid_q.put(doc["panoid"] )
            return panoid_q
        return []
    
    def req(self, panoid, heading):
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.9",
            "DNT": "1",
            "Host": "mapsv0.bdimg.com"
            # "Connection": "close"
        }

        url = "https://mapsv0.bdimg.com/?qt=pr3d&fovy=75&quality=100&panoid={}&heading={}&pitch=0&width=1024&height=512".format(panoid, heading)
        
        # keep requesting the url until the download succeeds
        while True:
            time.sleep(random.random()*2)
            try:
                # pick a random fake User-Agent
                headers["User-Agent"] = UserAgent(path="fake_useragent.json").random
                # get a proxy from the pool
                while True:
                    if self.ippool_q.empty() is False:
                        proxy_ip = self.ippool_q.get()
                        break
                    else:
                        time.sleep(5)

                img_path = "../imgs/{}-{}.jpg".format(panoid,heading)

                # download through the proxy with the fake User-Agent
                res = requests.get(url, headers=headers, proxies=proxy_ip, stream=True, timeout=20)
                if res.status_code == 200:
                    with open(img_path, "wb") as f:
                        f.write(res.content)
                    # print(img_path, "saved")

                    # check that the file was downloaded completely
                    file_size = int(res.headers["content-length"])
                    if os.path.getsize(img_path) != file_size:
                        os.remove(img_path)
                        continue

                    # put the working proxy back into the pool
                    self.ippool_q.put(proxy_ip)
                    return
                else:
                    print(res.status_code)
                    print(url)
            except (requests.exceptions.ConnectTimeout, urllib3.exceptions.ReadTimeoutError, requests.exceptions.ProxyError, urllib3.exceptions.MaxRetryError, requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError) as e:
                print("************************ Alert!! ********************************")
                print("Error:", repr(e))
            except:
                print("************************ Alert!! ********************************")
                traceback.print_exc()
                print("url: ", url)

    # thread 1: request the urls and download the images
    def producer(self):
        print("producer thread started.")
        while self.panoid_q.empty() is False:
            panoid = self.panoid_q.get()
            headings = [0, 90, 180, 270]
            for heading in headings:
                self.req(panoid, heading)
            self.finish_q.put(panoid)

        print("producer thread done")
    
    # thread 2: update the database
    def consumer(self):
        print("processor thread stared")

        client = pymongo.MongoClient("mongodb://localhost:27017/")
        db = client["area"]
        col = db["streetview"]

        while (self.panoid_q.empty() is False) or (self.finish_q.empty() is False):
            if self.finish_q.empty() is False:
                panoid = self.finish_q.get()
                col.update_one({
                    "panoid": panoid
                },
                {
                    "$set":{
                        "ok": 2
                    }
                })

        print("processor thread done")

    # thread 3: fetch proxies
    def get_ips(self):
        print("ippool thread start")
        self.ippool_q.put({"https": ""})

        while self.panoid_q.empty() is False:
            if self.ippool_q.empty() is True:
                while True:
                    tmp_ippool = buildippool()
                    if len(tmp_ippool) > 5:
                        for ip in tmp_ippool:
                            self.ippool_q.put(ip)
                        break
        print("ippool thread done")
    
    # thread 4: progress display
    def pbar(self):
        while (self.panoid_q.empty() is False) or (self.finish_q.empty() is False):
            print("Processing: {}/{}, time elapse: {:.0f}s, waiting for update: {}".format(self.all_lenth-self.panoid_q.qsize(), self.all_lenth, time.time()-self.start, self.finish_q.qsize()))
            time.sleep(1)

    def run(self):
        ths =[]

        pbar_thread = Thread(target=self.pbar)
        pbar_thread.start()
        ths.append(pbar_thread)

        ippool_thread = Thread(target=self.get_ips)
        ippool_thread.start()
        ths.append(ippool_thread)

        for _ in range(self.thread_num):
            producer_thread = Thread(target=self.producer)
            producer_thread.start()
            ths.append(producer_thread)

        consumer_thread = Thread(target=self.consumer)
        consumer_thread.start()
        ths.append(consumer_thread)

        # block the main thread until all workers finish
        for th in ths:
            th.join()

        print("Time consume:", time.time()-self.start, "s")  

if __name__ == '__main__':
    Spider().run()

07 Converting Baidu Mercator Coordinates to BD09

The remaining steps just enrich the metadata; skip them if you do not need it.

This step uses the official geoconv API again (from=6&to=5, i.e. Baidu Mercator to BD09 latitude/longitude).

import requests, json
import pymongo
from queue import Queue
from threading import Thread
import numpy as np
import time, random
import traceback

class Spider():
    def __init__(self):
        self.thread_num = 1
        self.start = time.time()

        self.groups_q = self.get_points()
        self.lenth = self.groups_q.qsize()
        self.aks_q = self.get_aks()

        self.valid_q = Queue()

    def get_aks(self):
        with open("ak.json", 'r', encoding='utf8') as f:
            j = json.load(f)
        aks = j["ak"]
        aks_q = Queue()
        for ak in aks:
            aks_q.put(ak)
        return aks_q
    
    def split_list(self, l, n=100):
        # split a list into chunks of n items
        new_l = []
        for i in range(0,len(l),n):
            new_l.append(l[i:i+n])
        return new_l
    
    def get_points(self):
        client = pymongo.MongoClient("mongodb://localhost:27017/")
        db = client["area"]
        col = db["streetview"]

        docs = col.find({"ok": 2})
        groups_q = Queue()
        if docs:
            points = [[doc["panoid"], doc["bd09mc"]] for doc in docs]
            point_groups = self.split_list(points)
            for group in point_groups:
                groups_q.put(group)
            return groups_q
        return groups_q

    def req(self, group):
        base = "http://api.map.baidu.com/geoconv/v1/?coords={}&from=6&to=5&ak={}"

        panoids = []
        bd09mc = []
        for point in group:
            panoids.append(point[0])
            lon, lat = point[1].split(",")
            # the X/Y values returned earlier are scaled up by 100, so divide them back down here
            bd09mc.append("{},{}".format(int(lon)/100, int(lat)/100))

        # keep requesting the url until a valid response comes back
        while True:
            time.sleep(random.random()*2)
            try:
                ak = self.aks_q.get()
                url = base.format(";".join(bd09mc), ak)
                res = requests.get(url)
                data = res.json()

                if data["status"] == 0:
                    # put the working ak back into the queue
                    self.aks_q.put(ak)

                    for i in range(len(data["result"])):
                        self.valid_q.put({
                            "panoid": panoids[i],
                            "bd09mc": bd09mc[i],
                            "bd09ll": "{},{}".format(data["result"][i]["x"],data["result"][i]["y"])
                        })
                    break
                else:
                    print(data)
            except Exception as e:
                # print("\n Error: ", repr(e))
                traceback.print_exc()

    # thread 1: request the urls and collect results
    def producer(self):
        while self.groups_q.empty() is False:
            group = self.groups_q.get()
            self.req(group)
    
    # thread 2: write the results from thread 1 to the database
    def write(self):
        client = pymongo.MongoClient("mongodb://localhost:27017/")
        db = client["area"]
        col = db["streetview"]

        while (self.groups_q.empty() is False) or (self.valid_q.empty() is False):
            if self.valid_q.empty() is False:
                data = self.valid_q.get()
                col.update_one({
                    "panoid": data["panoid"]
                },
                {
                    "$set":{
                        "bd09mc": data["bd09mc"],
                        "bd09ll": data["bd09ll"],
                        "ok": 3
                    }
                })
    
    # thread 3: progress display
    def pbar(self):
        state = [0, 0]
        while (self.groups_q.empty() is False) or (self.valid_q.empty() is False):
            if [self.lenth-self.groups_q.qsize(), self.valid_q.qsize()] != state:
                state = [self.lenth-self.groups_q.qsize(), self.valid_q.qsize()]
                print("Processing: {}/{}, time elapse: {:.0f}s, waiting for update: {}".format(self.lenth-self.groups_q.qsize(), self.lenth, time.time()-self.start, self.valid_q.qsize()))
                time.sleep(1)
                
    def run(self):
        ths =[]

        pbar_thread = Thread(target=self.pbar)
        pbar_thread.start()
        ths.append(pbar_thread)

        for _ in range(self.thread_num):
            producer_thread = Thread(target=self.producer)
            producer_thread.start()
            ths.append(producer_thread)

        write_thread = Thread(target=self.write)
        write_thread.start()
        ths.append(write_thread)

        # block the main thread until all workers finish
        for th in ths:
            th.join()
        
        print("Time consume:", time.time()-self.start, "s")  

if __name__ == '__main__':
    Spider().run()

08 Converting BD09 to WGS84

This is based on conversion code someone else wrote (I have forgotten the source):

import math

PI = math.pi
PIX = math.pi * 3000 / 180
EE = 0.00669342162296594323
A = 6378245.0


def bd09_to_gcj02(lng, lat):
    """BD09 -> GCJ02"""
    x, y =  lng - 0.0065, lat - 0.006
    z = math.sqrt(x * x + y * y) - 0.00002 * math.sin(y * PIX)
    theta = math.atan2(y, x) - 0.000003 * math.cos(x * PIX)
    lng, lat = z * math.cos(theta), z * math.sin(theta)
    return lng, lat


def gcj02_to_bd09(lng, lat):
    """GCJ02 -> BD09"""
    z = math.sqrt(lng * lng + lat * lat) + 0.00002 * math.sin(lat * PIX)
    theta = math.atan2(lat, lng) + 0.000003 * math.cos(lng * PIX)
    lng, lat = z * math.cos(theta) + 0.0065, z * math.sin(theta) + 0.006
    return lng, lat


def gcj02_to_wgs84(lng, lat):
    """GCJ02 -> WGS84"""
    if out_of_china(lng, lat):
        return lng, lat
    dlat = transform_lat(lng - 105.0, lat - 35.0)
    dlng = transform_lng(lng - 105.0, lat - 35.0)
    radlat = lat / 180.0 * PI
    magic = math.sin(radlat)
    magic = 1 - EE * magic * magic
    sqrtmagic = math.sqrt(magic)
    dlat = (dlat * 180.0) / ((A * (1 - EE)) / (magic * sqrtmagic) * PI)
    dlng = (dlng * 180.0) / (A / sqrtmagic * math.cos(radlat) * PI)
    lng, lat = lng - dlng, lat - dlat
    return lng, lat


def wgs84_to_gcj02(lng, lat):
    """WGS84 -> GCJ02"""
    if out_of_china(lng, lat):
        return lng, lat
    dlat = transform_lat(lng - 105.0, lat - 35.0)
    dlng = transform_lng(lng - 105.0, lat - 35.0)
    radlat = lat / 180.0 * PI
    magic = math.sin(radlat)
    magic = 1 - EE * magic * magic
    sqrtmagic = math.sqrt(magic)
    dlat = (dlat * 180.0) / ((A * (1 - EE)) / (magic * sqrtmagic) * PI)
    dlng = (dlng * 180.0) / (A / sqrtmagic * math.cos(radlat) * PI)
    lng, lat = lng + dlng, lat + dlat
    return lng, lat


def mapbar_to_wgs84(lng, lat):
    """MapBar -> WGS84"""
    lng = lng * 100000.0 % 36000000
    lat = lat * 100000.0 % 36000000
    lng1 = int(lng - math.cos(lat / 100000.0) * lng / 18000.0 - math.sin(lng / 100000.0) * lat / 9000.0) 
    lat1 = int(lat - math.sin(lat / 100000.0) * lng / 18000.0 - math.cos(lng / 100000.0) * lat / 9000.0)
    lng2 = int(lng - math.cos(lat1 / 100000.0) * lng1 / 18000.0 - math.sin(lng1 / 100000.0) * lat1 / 9000.0 + (1 if lng > 0 else -1))
    lat2 = int(lat - math.sin(lat1 / 100000.0) * lng1 / 18000.0 - math.cos(lng1 / 100000.0) * lat1 / 9000.0 + (1 if lat > 0 else -1)) 
    lng, lat = lng2 / 100000.0, lat2 / 100000.0
    return lng, lat


def transform_lat(lng, lat):
    """GCJ02 latitude transformation"""
    ret = -100 + 2.0 * lng + 3.0 * lat + 0.2 * lat * lat + 0.1 * lng * lat + 0.2 * math.sqrt(math.fabs(lng))
    ret += (20.0 * math.sin(6.0 * lng * PI) + 20.0 * math.sin(2.0 * lng * PI)) * 2.0 / 3.0
    ret += (20.0 * math.sin(lat * PI) + 40.0 * math.sin(lat / 3.0 * PI)) * 2.0 / 3.0
    ret += (160.0 * math.sin(lat / 12.0 * PI) + 320.0 * math.sin(lat * PI / 30.0)) * 2.0 / 3.0
    return ret


def transform_lng(lng, lat):
    """GCJ02 longtitude transformation"""
    ret = 300.0 + lng + 2.0 * lat + 0.1 * lng * lng + 0.1 * lng * lat + 0.1 * math.sqrt(math.fabs(lng))
    ret += (20.0 * math.sin(6.0 * lng * PI) + 20.0 * math.sin(2.0 * lng * PI)) * 2.0 / 3.0
    ret += (20.0 * math.sin(lng * PI) + 40.0 * math.sin(lng / 3.0 * PI)) * 2.0 / 3.0
    ret += (150.0 * math.sin(lng / 12.0 * PI) + 300.0 * math.sin(lng / 30.0 * PI)) * 2.0 / 3.0
    return ret


def out_of_china(lng, lat):
    """No offset when coordinate out of China."""
    if lng < 72.004 or lng > 137.8437:
        return True
    if lat < 0.8293 or lat > 55.8271:
        return True
    return False


def bd09_to_wgs84(lng, lat):
    """BD09 -> WGS84"""
    lng, lat = bd09_to_gcj02(lng, lat)
    lng, lat = gcj02_to_wgs84(lng, lat)
    return lng, lat


def wgs84_to_bd09(lng, lat):
    """WGS84 -> BD09"""
    lng, lat = wgs84_to_gcj02(lng, lat)
    lng, lat = gcj02_to_bd09(lng, lat)
    return lng, lat


def mapbar_to_gcj02(lng, lat):
    """MapBar -> GCJ02"""
    lng, lat = mapbar_to_wgs84(lng, lat)
    lng, lat = wgs84_to_gcj02(lng, lat)
    return lng, lat


def mapbar_to_bd09(lng, lat):
    """MapBar -> BD09"""
    lng, lat = mapbar_to_wgs84(lng, lat)
    lng, lat = wgs84_to_bd09(lng, lat)
    return lng, lat


if __name__ == '__main__':
    blng, blat = 121.4681891220,31.1526609317
    print('BD09:', (blng, blat))
    print('BD09 -> GCJ02:', bd09_to_gcj02(blng, blat))
    print('BD09 -> WGS84:',bd09_to_wgs84(blng, blat))
    wlng, wlat = 121.45718237717077, 31.14846209914084
    print('WGS84:', (wlng, wlat))
    print('WGS84 -> GCJ02:', wgs84_to_gcj02(wlng, wlat))
    print('WGS84 -> BD09:', wgs84_to_bd09(wlng, wlat))
    mblng, mblat = 121.4667323772, 31.1450420991
    print('MapBar:', (mblng, mblat))
    print('MapBar -> WGS84:', mapbar_to_wgs84(mblng, mblat))
    print('MapBar -> GCJ02:', mapbar_to_gcj02(mblng, mblat))
    print('MapBar -> BD09:', mapbar_to_bd09(mblng, mblat))

Then import converter.py in your own code:

import pymongo
from queue import Queue
from threading import Thread
import math
import time, random
import traceback
from converter import *

class Spider():
    def __init__(self):
        self.thread_num = 5
        self.start = time.time()

        self.points_q = self.get_points()
        self.lenth = self.points_q.qsize()

        self.valid_q = Queue()
    
    def get_points(self):
        client = pymongo.MongoClient("mongodb://localhost:27017/")
        db = client["area"]
        col = db["streetview"]

        docs = col.find({"ok": 3})
        points_q = Queue()
        if docs:
            points = [[doc["panoid"], doc["bd09ll"]] for doc in docs]
            for point in points:
                points_q.put(point)
            return points_q
        return points_q

    # thread 1: convert coordinates
    def producer(self):
        while self.points_q.empty() is False:
            panoid, point = self.points_q.get()
            lon, lat = point.split(",")
            lon, lat = float(lon), float(lat)
            nlon, nlat = bd09_to_wgs84(lon, lat) # the stored coordinates are bd09ll, so convert BD09 -> WGS84
            self.valid_q.put({
                "panoid": panoid,
                "wgs84": "{},{}".format(nlon, nlat)
            })
    
    # thread 2: write the results from thread 1 to the database
    def write(self):
        client = pymongo.MongoClient("mongodb://localhost:27017/")
        db = client["area"]
        col = db["streetview"]

        while (self.points_q.empty() is False) or (self.valid_q.empty() is False):
            if self.valid_q.empty() is False:
                data = self.valid_q.get()
                col.update_one({
                    "panoid": data["panoid"]
                },
                {
                    "$set":{
                        "wgs84": data["wgs84"],
                        "ok": 4
                    }
                })
    
    # thread 3: progress display
    def pbar(self):
        state = [0, 0]
        while (self.points_q.empty() is False) or (self.valid_q.empty() is False):
            if [self.lenth-self.points_q.qsize(), self.valid_q.qsize()] != state:
                state = [self.lenth-self.points_q.qsize(), self.valid_q.qsize()]
                print("Processing: {}/{}, time elapse: {:.0f}s, waiting for update: {}".format(self.lenth-self.points_q.qsize(), self.lenth, time.time()-self.start, self.valid_q.qsize()))
                time.sleep(1)
                
    def run(self):
        ths =[]

        pbar_thread = Thread(target=self.pbar)
        pbar_thread.start()
        ths.append(pbar_thread)

        for _ in range(self.thread_num):
            producer_thread = Thread(target=self.producer)
            producer_thread.start()
            ths.append(producer_thread)

        write_thread = Thread(target=self.write)
        write_thread.start()
        ths.append(write_thread)

        # block the main thread until all workers finish
        for th in ths:
            th.join()
        
        print("Time consume:", time.time()-self.start, "s")  

if __name__ == '__main__':
    Spider().run()

All downloads are now complete.
