自動抓取並解析一個商品頁

  • 以美國 adidas 官網為例。
  • 輸入 URL,抓取商品信息(標題、描述、圖片等)以及屬性信息(顏色、尺碼、價格、庫存、skuId)。
  • 思路很簡單:打開頁面,逐一分析包含所需內容的標籤。

獲取頁面

/**
 * Sends a POST request through {@code getHttpPostResponse} and parses the
 * response body into a jsoup {@link Document}.
 *
 * <p>The body is decoded as UTF-8. After parsing, the response entity is
 * passed to {@code EntityUtils.consume}.
 *
 * @param url        target URL, forwarded unchanged to {@code getHttpPostResponse}
 * @param referrer   referrer value, forwarded unchanged to {@code getHttpPostResponse}
 * @param params     form parameters, forwarded unchanged to {@code getHttpPostResponse}
 * @param httpClient client used to execute the request
 * @return the parsed document
 * @throws IOException if executing the request or reading the entity fails
 */
public static Document getHttpPostResponseWithDocument(String url, String referrer, List<NameValuePair> params,
                                                       DecompressingHttpClient httpClient) throws IOException {
    final HttpResponse postResponse = getHttpPostResponse(url, referrer, params, httpClient);
    final String html = EntityUtils.toString(postResponse.getEntity(), "UTF-8");
    final Document parsedPage = Jsoup.parse(html);
    // Consume the entity only after the body has been read and parsed.
    EntityUtils.consume(postResponse.getEntity());
    return parsedPage;
}

/**
 * Executes an HTTP GET request and returns the raw response.
 *
 * <p>The request is first passed to {@code setHeaders}; a {@code Referer}
 * header is added only when {@code referrer} is not blank. This method does
 * not read or consume the response entity.
 *
 * @param url        URL to request
 * @param referrer   value for the {@code Referer} header; skipped when blank
 * @param httpClient client used to execute the request
 * @return the response returned by {@code httpClient.execute}
 * @throws IOException if executing the request fails
 */
public static HttpResponse getHttpGetResponse(String url, String referrer, DecompressingHttpClient httpClient) throws IOException {
    final HttpGet request = new HttpGet(url);
    setHeaders(request);

    final boolean hasReferrer = !StringUtils.isBlank(referrer);
    if (hasReferrer) {
        request.setHeader("Referer", referrer);
    }

    final HttpResponse response = httpClient.execute(request);
    return response;
}

判斷是否有貨

/**
 * Reports whether the current page ({@code doc}) shows the product as purchasable.
 *
 * <p>The product counts as in stock only when the page has at least one
 * {@code .addtocart} element and the serialised HTML of those elements
 * contains the text {@code add-to-cart-button}.
 *
 * @return {@code true} if an add-to-cart button is present, {@code false} otherwise
 */
public boolean isInStock() {
    final Elements cartArea = doc.select(".addtocart");
    final boolean hasCartArea = cartArea != null && !cartArea.isEmpty();
    return hasCartArea && cartArea.toString().contains("add-to-cart-button");
}

顏色獲取

/**
 * Loads a product page and collects its colour variations into {@code colorMap}.
 *
 * <p>Steps:
 * <ol>
 *   <li>Fetch the page via {@code getOneSkuInfoPage}; fail immediately if the fetch fails.</li>
 *   <li>Fail if {@link #isInStock()} reports the product as unavailable.</li>
 *   <li>Log the currently selected colour, read from the {@code .product-color} element.</li>
 *   <li>For every {@code .color-variations-thumb-color} element inside a
 *       {@code .color-variation-row}, put {@code data-articleno -> title} into
 *       {@code colorMap}.</li>
 * </ol>
 *
 * <p>NOTE(review): the method reads the {@code doc} field rather than the document
 * carried by the fetch result; presumably {@code getOneSkuInfoPage} populates
 * {@code doc} — confirm.
 *
 * @param url      product page URL
 * @param colorMap output map, filled with article number to colour title entries
 * @return {@code ExecInfo.succ()} on success, otherwise an {@code ExecInfo.fail(...)}
 *         carrying the reason
 */
public ExecInfo parse(String url, Map<String, String> colorMap) {
    // Compile each pattern once per call instead of once per loop iteration.
    final Pattern currentColorPattern = Pattern.compile("<span class=\"product-color-clear\">([^<]*)</span>");
    final Pattern currentSkuPattern = Pattern.compile("\\(([0-9A-Za-z]*)\\)");
    final Pattern variationSkuPattern = Pattern.compile("data-articleno=\"([0-9A-Za-z]*)");
    final Pattern variationTitlePattern = Pattern.compile("title=\"([^\"]*)");

    ExecResult<Document> execResult = getOneSkuInfoPage(url);
    if (!execResult.isSucc()) {
        // Previously the failure was only logged and parsing continued on
        // whatever page state was left; stop here instead.
        LogUtils.info(execResult.getMsg());
        return ExecInfo.fail(execResult.getMsg());
    }
    if (!isInStock()) {
        LogUtils.info("out of stock!");
        return ExecInfo.fail("out of stock!");
    }

    Elements curColorElements = doc.select(".product-color");
    if (null == curColorElements || curColorElements.isEmpty()) {
        return ExecInfo.fail("獲取當前商品顏色信息失敗");
    }
    // Serialise once; both patterns scan the same HTML.
    final String curColorHtml = curColorElements.toString();
    Matcher curColorMatcher = currentColorPattern.matcher(curColorHtml);
    Matcher curSkuMatcher = currentSkuPattern.matcher(curColorHtml);
    if (curColorMatcher.find() && curSkuMatcher.find()) {
        LogUtils.info("CURRENT COLOR: " + curSkuMatcher.group(1) + ", " + curColorMatcher.group(1));
    }

    Elements rows = doc.select(".color-variation-row");
    if (null != rows) {
        for (Element row : rows) {
            for (Element colorElement : row.select(".color-variations-thumb-color")) {
                final String colorHtml = colorElement.toString();
                Matcher skuMatcher = variationSkuPattern.matcher(colorHtml);
                Matcher titleMatcher = variationTitlePattern.matcher(colorHtml);
                // Record a variation only when both article number and title are present.
                if (skuMatcher.find() && titleMatcher.find()) {
                    colorMap.put(skuMatcher.group(1), titleMatcher.group(1));
                }
            }
        }
    }
    LogUtils.info(colorMap.toString());
    return ExecInfo.succ();
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章