webmagic實戰使用

一.引入依賴包

<!-- Maven dependencies for the crawler below: webmagic-core and webmagic-extension, both pinned to version 0.4.3. -->
<dependency>
  <groupId>us.codecraft</groupId>
  <artifactId>webmagic-core</artifactId>
  <version>0.4.3</version>
  </dependency>
  <dependency>
  <groupId>us.codecraft</groupId>
  <artifactId>webmagic-extension</artifactId>
  <version>0.4.3</version>
  </dependency>

二.代碼

package com.pz998.quartz.spider;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang.StringUtils;
import org.eclipse.jetty.util.MultiMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.pz998.rpc.model.entity.BdDepartmentDiseaseRelaRpc;
import com.pz998.rpc.model.entity.BdDepartmentRpc;
import com.pz998.rpc.model.entity.BdDiseaseDoctorRelaRpc;
import com.pz998.rpc.model.entity.BdDoctorRpc;
import com.pz998.rpc.model.entity.BdHospitalRpc;

import net.minidev.json.JSONArray;
import net.minidev.json.JSONObject;
import net.minidev.json.parser.JSONParser;
import net.minidev.json.parser.ParseException;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import us.codecraft.webmagic.selector.JsonPathSelector;
import us.codecraft.xsoup.Xsoup;

/**
 * WebMagic {@link PageProcessor} that crawls yi.baidu.com.
 *
 * <p>The crawl starts at a hospital list URL and follows the chain
 * hospital list -> hospital info / department list -> department detail ->
 * doctor list -> doctor detail. {@link #process(Page)} matches the page URL
 * against one regular expression per page type and delegates to the helper
 * for that type. Each helper stores the entities it extracted on the page
 * via {@code page.putField(...)} and queues follow-up URLs via
 * {@code page.addTargetRequest(...)}.
 *
 * <p>Identifiers of the source site (hospital name, department id, doctor id)
 * are read back from the query string of the crawled URL.
 */
public class YiBaiduProcessor implements PageProcessor {

    private static final String START_URL = "https://yi.baidu.com/pc/hospital/list?cityId=371&pageSize=10&page=1";

    private static final String HOSPITAL_DETAIL_URL = "https://yi\\.baidu\\.com/pc/hospital/index\\?zt=pcpinzhuan&zt_ext=&pvid=\\d+&key=\\S+";

    private static final String HOSPITAL_LIST_URL = "https://yi\\.baidu\\.com/pc/hospital/list\\?cityId=\\d++&pageSize=10&page=\\d++";

    private static final String HOSPITAL_INFO_URL = "https://yi\\.baidu\\.com/pc/hospital/info\\?key=\\S+";

    private static final String DEPT_INFO_URL = "https://yi\\.baidu\\.com/pc/admindepartment/detail\\?zt=\\w+&zt_ext=&pvid=\\d+&hosId=\\d+&adminDepartId=\\d+";

    private static final String HOSPITAL_DEPT_URL = "https://yi\\.baidu\\.com/pc/hospital/alldep\\?key=\\S+";

    private static final String DOCTOR_LIST_URL = "https://yi\\.baidu\\.com/pc/admindepartment/doctorlist\\?diseaseId=0&medTitle=0&serviceType=0&page=\\d+&pageSize=8&provId=0&cityId=0&regionId=0&adminDepartId=\\d+&hosId=\\d+";

    private static final String DOCTOR_INFO_URL = "https://yi\\.baidu\\.com/pc/doctor/detailpage\\?zt=\\w+&zt_ext=&pvid=0&doctorId=\\d+";
    // Example URLs:
    // https://yi.baidu.com/pc/hospital/info?key=%E6%AD%A6%E6%B1%89%E5%B8%82%E5%A6%87%E5%A5%B3%E5%84%BF%E7%AB%A5%E5%8C%BB%E7%96%97%E4%BF%9D%E5%81%A5%E4%B8%AD%E5%BF%83
    // https://yi.baidu.com/pc/hospital/alldep?key=

    /** Number of doctor-list pages (1..N) queued for every department. */
    private static final int DOCTOR_LIST_MAX_PAGES = 5;

    private final Site site = Site.me();

    /** Value of the JSON {@code status} field that marks a successful response. */
    public static final String STATE_SUCCESS = "0";

    /** Maps the {@code cityId} query parameter to the city name stored on a hospital. */
    public static final Map<String, String> CITY_MAP = new HashMap<String, String>();

    static {
        CITY_MAP.put("371", "武漢");
        CITY_MAP.put("1", "北京");
        CITY_MAP.put("2", "上海");
        CITY_MAP.put("84", "廣州");
    }

    /**
     * Dispatches the page to the handler whose URL pattern matches.
     *
     * <p>The first three handlers (hospital list, hospital info, hospital
     * departments) are best-effort: any exception is printed and swallowed.
     * Exceptions from the remaining handlers propagate to the caller.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String url = page.getUrl().toString();
        if (page.getUrl().regex(HOSPITAL_LIST_URL).match()) {
            try {
                processHospitalList(page, url);
            } catch (Exception e) {
                e.printStackTrace();
            }
        } else if (page.getUrl().regex(HOSPITAL_INFO_URL).match()) {
            try {
                processHospitalInfo(page, url);
            } catch (Exception e) {
                e.printStackTrace();
            }
        } else if (page.getUrl().regex(HOSPITAL_DEPT_URL).match()) {
            try {
                processHospitalDepartments(page, url);
            } catch (Exception e) {
                e.printStackTrace();
            }
        } else if (page.getUrl().regex(DEPT_INFO_URL).match()) {
            processDepartmentInfo(page, url);
        } else if (page.getUrl().regex(DOCTOR_LIST_URL).match()) {
            processDoctorList(page, url);
        } else if (page.getUrl().regex(DOCTOR_INFO_URL).match()) {
            processDoctorInfo(page, url);
        }
    }

    /** Reads the hospital list JSON, stores the hospitals and queues each hospital's info and department pages. */
    private void processHospitalList(Page page, String url) {
        String state = new JsonPathSelector("$.status").select(page.getRawText());
        if (!STATE_SUCCESS.equals(state)) {
            return;
        }
        // NOTE(review): typed as List<?> because the elements are cast to JSONObject below;
        // presumably this WebMagic version hands back the parsed JSON nodes - confirm.
        List<?> hospitalList = new JsonPathSelector("$.data.hospitalList[*]").selectList(page.getRawText());
        MultiMap<String> resultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);
        String cityId = resultMap.getString("cityId");
        if (CollectionUtils.isEmpty(hospitalList)) {
            return;
        }

        String city = CITY_MAP.get(cityId);
        List<BdHospitalRpc> bdHospitalList = new ArrayList<BdHospitalRpc>();
        for (Object obj : hospitalList) {
            JSONObject jsonObj = (JSONObject) obj;
            String name = (String) jsonObj.get("name");
            System.out.println("name:" + name);
            String address = (String) jsonObj.get("address");
            String level = (String) jsonObj.get("level");
            Integer insurance = (Integer) jsonObj.get("insurance");
            String phone = (String) jsonObj.get("phone");
            String grade = (String) jsonObj.get("grade");
            Integer doctorNum = (Integer) jsonObj.get("doctorNum");
            String imageUrl = (String) jsonObj.get("logo");
            Integer serveNum = (Integer) jsonObj.get("serveNum");
            Integer commentNum = (Integer) jsonObj.get("commentNum");
            String routeLink = (String) jsonObj.get("routeLink");

            // The "location" parameter of the route link is split on ',' into latitude, longitude.
            MultiMap<String> routeLinkMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(routeLink);
            String location = routeLinkMap.getString("location");
            String latitude = "";
            String longitude = "";
            if (StringUtils.isNotEmpty(location)) {
                String[] locationArray = location.split(",");
                latitude = locationArray.length > 0 ? locationArray[0] : "";
                longitude = locationArray.length > 1 ? locationArray[1] : "";
            }

            BdHospitalRpc bdHospitalRpc = new BdHospitalRpc();
            // The hospital name doubles as its source id; the follow-up pages are keyed by it.
            bdHospitalRpc.setSourceId(name);
            bdHospitalRpc.setName(name);
            bdHospitalRpc.setAddress(address);
            bdHospitalRpc.setLevel(level);
            bdHospitalRpc.setPhone(phone);
            bdHospitalRpc.setImageUrl(imageUrl);
            bdHospitalRpc.setLatitude(latitude);
            bdHospitalRpc.setLongitude(longitude);
            bdHospitalRpc.setScore(grade);
            bdHospitalRpc.setCity(city);
            bdHospitalRpc.setIsMedicalInsurance(nullToEmpty(insurance));
            bdHospitalRpc.setHighQualityDoctorNum(nullToEmpty(doctorNum));
            bdHospitalRpc.setFinishedServiceNum(nullToEmpty(serveNum));
            bdHospitalRpc.setPatientCommentNum(nullToEmpty(commentNum));
            bdHospitalList.add(bdHospitalRpc);

            // NOTE(review): the name is appended without URL encoding - confirm the downloader copes with it.
            String infoUrl = "https://yi.baidu.com/pc/hospital/info?key=" + name;
            String allDeptUrl = "https://yi.baidu.com/pc/hospital/alldep?key=" + name;
            page.addTargetRequest(infoUrl);
            page.addTargetRequest(allDeptUrl);
        }

        page.putField("bdHospitalList", bdHospitalList);
    }

    /** Extracts up to five descriptive paragraphs from the hospital info page. */
    private void processHospitalInfo(Page page, String url) {
        MultiMap<String> resultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);
        String hosName = resultMap.getString("key");
        BdHospitalRpc bdHospitalRpc = new BdHospitalRpc();
        List<String> contextList = page.getHtml().xpath("ul[@class='container-list-info']/li[@class='ys-util-margin-b35']/p[@class='ys-util-text-smaller ys-util-margin-t9 ys-util-margin-b30']/text()").all();
        if (CollectionUtils.isNotEmpty(contextList)) {
            // Paragraphs are mapped by position; missing ones become "".
            // Per the original notes the order is: overview, history, featured departments, medical team, honors.
            bdHospitalRpc.setContent(elementOrEmpty(contextList, 0));
            bdHospitalRpc.setHistory(elementOrEmpty(contextList, 1));
            bdHospitalRpc.setCharacteristicDept(elementOrEmpty(contextList, 2));
            bdHospitalRpc.setTeam(elementOrEmpty(contextList, 3));
            bdHospitalRpc.setHonor(elementOrEmpty(contextList, 4));
        }

        bdHospitalRpc.setSourceId(hosName);
        page.putField("bdHospitalRpc", bdHospitalRpc);
    }

    /** Parses the department tables of a hospital and queues department detail and doctor list pages. */
    private void processHospitalDepartments(Page page, String url) {
        MultiMap<String> resultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);
        String hosName = resultMap.getString("key");
        String topDepts = "";
        List<String> tableHtml = page.getHtml().xpath("div[@class='container-common-office']/table[@class='ys-util-margin-b15 list-office ys-util-border-big']").all();
        List<BdDepartmentRpc> departmentList = new ArrayList<BdDepartmentRpc>();
        for (String html : tableHtml) {
            Document document = Jsoup.parse(html);
            String platDept = Xsoup.select(document, "td[@class='primary-office']/h4/text()").get();
            List<String> hospitalDepts = Xsoup.select(document, "td[@class='secondary-office']/dl/dd/h4/a[@class='a-hover ys-util-text-normal']").list();
            List<String> hospitalDeptNames = Xsoup.select(document, "td[@class='secondary-office']/dl/dd/h4/a[@class='a-hover ys-util-text-normal']/text()").list();

            if (StringUtils.isEmpty(platDept)) {
                // A table without a primary-office heading is treated as the key departments.
                topDepts = com.pz998.quartz.utils.StringUtils.listToString(hospitalDeptNames);
                continue;
            }

            // Regular hospital departments, grouped under the primary-office heading.
            for (String d : hospitalDepts) {
                Document deptDocument = Jsoup.parse(d);
                String deptName = Xsoup.select(deptDocument, "a/text()").get();
                String deptHref = Xsoup.select(deptDocument, "a/@href").get();
                MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(deptHref);
                String deptId = deptResultMap.getString("adminDepartId");
                String hosId = deptResultMap.getString("hosId");

                BdDepartmentRpc bdDepart = new BdDepartmentRpc();
                bdDepart.setSourceId(deptId);
                bdDepart.setName(deptName);
                bdDepart.setParentSource(platDept);
                bdDepart.setHospitalSource(hosName);
                departmentList.add(bdDepart);

                // Queue the department detail page.
                page.addTargetRequest(deptHref);
                // Queue the first pages of the department's doctor list.
                for (int i = 1; i <= DOCTOR_LIST_MAX_PAGES; i++) {
                    String doctorUrl = "https://yi.baidu.com/pc/admindepartment/doctorlist?diseaseId=0&medTitle=0&serviceType=0&page=" + i + "&pageSize=8&provId=0&cityId=0&regionId=0&adminDepartId=" + deptId + "&hosId=" + hosId;
                    page.addTargetRequest(doctorUrl);
                }
            }
        }

        BdHospitalRpc bdHospitalRpc = new BdHospitalRpc();
        bdHospitalRpc.setSourceId(hosName);
        System.out.println("重點科室:" + topDepts);
        bdHospitalRpc.setCharacteristicFaculty(topDepts);
        page.putField("hosTopDept", bdHospitalRpc);
        page.putField("departmentList", departmentList);
    }

    /** Extracts phone, address, description and title text from a department detail page. */
    private void processDepartmentInfo(Page page, String url) {
        String deptPhone = page.getHtml().xpath("div[@class='summary-left']/div[@class='summary-row ys-util-margin-t12 ys-util-text-normal-height']/label[@class='ys-util-text-normal ys-util-margin-l10']/text()").toString();
        String deptAddress = page.getHtml().xpath("div[@class='summary-left']/div[@class='summary-row ys-util-margin-t8 ys-util-text-normal']/label[@class='ys-util-text-normal ys-util-margin-l10']/text()").toString();
        String content = page.getHtml().xpath("div[@class='office-info']/p[@class='ys-util-text-smaller ys-util-margin-t15 office-info-total']/text()").toString();
        String titleDescr = page.getHtml().xpath("div[@class='summary-left']/div[@class='summary-row ys-util-margin-t12 ys-util-text-min-height']/h3[@class='ys-util-text-min ys-util-margin-r12']/text()").toString();

        MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);
        String deptId = deptResultMap.getString("adminDepartId");

        BdDepartmentRpc bdDepartmentRpc = new BdDepartmentRpc();
        bdDepartmentRpc.setAddress(deptAddress);
        bdDepartmentRpc.setPhone(deptPhone);
        bdDepartmentRpc.setContent(content);
        bdDepartmentRpc.setSourceId(deptId);
        bdDepartmentRpc.setTitleDescr(titleDescr);
        page.putField("bdDepartmentRpc", bdDepartmentRpc);
    }

    /**
     * Reads a doctor list JSON page: department/disease relations (first page only),
     * doctors, and doctor/disease relations. Queues every doctor's detail page.
     */
    private void processDoctorList(Page page, String url) {
        String status = new JsonPathSelector("$.status").select(page.getRawText());
        if (!STATE_SUCCESS.equals(status)) {
            return;
        }
        String data = new JsonPathSelector("$.data[*]").select(page.getRawText());
        if (data == null) {
            return;
        }

        MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);
        String deptId = deptResultMap.getString("adminDepartId");
        String hosId = deptResultMap.getString("hosId");
        String pageNum = deptResultMap.getString("page");

        // dataJo stays null when the payload cannot be parsed; everything below must tolerate that.
        JSONObject dataJo = null;
        try {
            dataJo = (JSONObject) new JSONParser().parse(data);
        } catch (ParseException e) {
            e.printStackTrace();
        }

        List<BdDepartmentDiseaseRelaRpc> departmentDiseaseRelaList = new ArrayList<BdDepartmentDiseaseRelaRpc>();
        // The disease selector is only read from the first page of a department's doctor list.
        if ("1".equals(pageNum)) {
            collectDepartmentDiseases(dataJo, hosId, deptId, departmentDiseaseRelaList);
        }
        page.putField("bdDepartmentDiseaseRelaRpcList", departmentDiseaseRelaList);

        // Fix: the original dereferenced dataJo here without a null check and threw a
        // NullPointerException whenever the parse above had failed.
        if (dataJo == null || !dataJo.containsKey("doctorList")) {
            return;
        }
        List<?> doctorList = new JsonPathSelector("$.data.doctorList[*]").selectList(page.getRawText());
        if (CollectionUtils.isEmpty(doctorList)) {
            return;
        }

        // Doctors found on this page.
        List<BdDoctorRpc> bdDoctorList = new ArrayList<BdDoctorRpc>();
        // Doctor <-> disease relations found on this page.
        List<BdDiseaseDoctorRelaRpc> bdDiseaseDoctorRelaList = new ArrayList<BdDiseaseDoctorRelaRpc>();
        for (Object o : doctorList) {
            JSONObject doctorJo = (JSONObject) o;

            // Doctor certification marks.
            String identifyMarkStr = "";
            if (doctorJo.containsKey("doctorIdentify")) {
                List<String> identifyMarkList = new JsonPathSelector("$.doctorIdentify[*].identifyMark").selectList(doctorJo.toJSONString());
                identifyMarkStr = com.pz998.quartz.utils.StringUtils.listToString(identifyMarkList);
            }

            String doctorName = (String) doctorJo.get("doctorName");
            String doctorTitle = (String) doctorJo.get("doctorTitle");
            Object commentScore = doctorJo.get("commentScore");
            String doctorSkill = (String) doctorJo.get("doctorSkill");
            String allTimeHref = (String) doctorJo.get("allTimeHref");
            String doctorPhoto = (String) doctorJo.get("doctorPhoto");

            // Queue the doctor detail page; the doctor id is taken from that link's query string.
            page.addTargetRequest(allTimeHref);
            MultiMap<String> resultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(allTimeHref);
            String doctorId = resultMap.getString("doctorId");

            BdDoctorRpc bdDoctorRpc = new BdDoctorRpc();
            bdDoctorRpc.setHospitalSourceId(hosId);
            bdDoctorRpc.setDepartmentSourceId(deptId);
            bdDoctorRpc.setSourceId(doctorId);
            bdDoctorRpc.setName(doctorName);
            bdDoctorRpc.setPracticeTitle(doctorTitle);
            bdDoctorRpc.setRecommendScore(nullToEmpty(commentScore));
            bdDoctorRpc.setDiseaseTag(doctorSkill);
            bdDoctorRpc.setImageUrl(doctorPhoto);
            bdDoctorRpc.setIdentifyMark(identifyMarkStr);
            bdDoctorList.add(bdDoctorRpc);

            JSONArray treatPatientArray = (JSONArray) doctorJo.get("treatPatient");
            if (CollectionUtils.isNotEmpty(treatPatientArray)) {
                for (Object treatPatient : treatPatientArray) {
                    JSONObject treatPatientJo = (JSONObject) treatPatient;
                    String diseaseName = (String) treatPatientJo.get("diseaseName");
                    BdDiseaseDoctorRelaRpc bdDiseaseDoctorRelaRpc = new BdDiseaseDoctorRelaRpc();
                    bdDiseaseDoctorRelaRpc.setDiseaseSourceId(diseaseName);
                    bdDiseaseDoctorRelaRpc.setDoctorSourceId(doctorId);
                    bdDiseaseDoctorRelaList.add(bdDiseaseDoctorRelaRpc);
                }
            }
        }

        page.putField("bdDiseaseDoctorRelaList", bdDiseaseDoctorRelaList);
        page.putField("bdDoctorList", bdDoctorList);
    }

    /**
     * Appends one department/disease relation per entry of the first selector's
     * {@code list}, skipping the entry whose {@code itemName} is "全部".
     * Does nothing when {@code dataJo} is {@code null} or has no selector entries.
     */
    private void collectDepartmentDiseases(JSONObject dataJo, String hosId, String deptId,
            List<BdDepartmentDiseaseRelaRpc> target) {
        JSONArray diseaseArray = dataJo == null ? null : (JSONArray) dataJo.get("selectorList");
        if (CollectionUtils.isEmpty(diseaseArray)) {
            return;
        }
        JSONObject firstSelector = (JSONObject) diseaseArray.get(0);
        JSONArray diseaseList = (JSONArray) firstSelector.get("list");
        if (CollectionUtils.isEmpty(diseaseList)) {
            return;
        }
        for (Object disease : diseaseList) {
            JSONObject diseaseJo = (JSONObject) disease;
            String itemName = (String) diseaseJo.get("itemName");
            if ("全部".equals(itemName)) {
                continue;
            }
            BdDepartmentDiseaseRelaRpc relation = new BdDepartmentDiseaseRelaRpc();
            relation.setHospitalSourceId(hosId);
            relation.setDepartmentSourceId(deptId);
            relation.setDiseaseSource(itemName);
            target.add(relation);
        }
    }

    /** Extracts the experience text and up to three scores from a doctor detail page. */
    private void processDoctorInfo(Page page, String url) {
        MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);
        String doctorId = deptResultMap.getString("doctorId");

        BdDoctorRpc bdDoctorRpc = new BdDoctorRpc();
        bdDoctorRpc.setSourceId(doctorId);

        String experience = page.getHtml().xpath("div[@class='doctor-experience']/div[@class='ys-util-text-smaller ys-util-margin-t10']/text()").toString();
        bdDoctorRpc.setIntro(experience);
        List<String> commentList = page.getHtml().xpath("ul[@class='summary-comment']/li/p[@class='ys-util-text-default ys-util-text-smaller']/i[@class='comment-score ys-util-text-primary ys-util-text-big']/text()").all();
        if (CollectionUtils.isNotEmpty(commentList)) {
            // Scores are mapped by position; missing ones become "".
            bdDoctorRpc.setRecommendScore(elementOrEmpty(commentList, 0));
            bdDoctorRpc.setTreatmentEffectScore(elementOrEmpty(commentList, 1));
            bdDoctorRpc.setAttitudeScore(elementOrEmpty(commentList, 2));
        }

        page.putField("bdDoctorRpc", bdDoctorRpc);
    }

    /** Returns {@code value.toString()}, or {@code ""} when {@code value} is {@code null}. */
    private static String nullToEmpty(Object value) {
        return value == null ? "" : value.toString();
    }

    /** Returns the element at {@code index}, or {@code ""} when the list is shorter than that. */
    private static String elementOrEmpty(List<String> values, int index) {
        return values.size() > index ? values.get(index) : "";
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** Starts the crawl from {@code START_URL} with 10 threads. */
    public static void main(String[] args) {
        Spider.create(new YiBaiduProcessor()).addUrl(START_URL).thread(10).run();
    }
}
  • 上述代碼採集百度醫生數據,採集線路進入醫院列表-->醫院詳情-->科室列表-->科室詳情-->醫生列表-->醫生詳情

  • 每個 else if 匹配一類頁面地址,即上面說的採集鏈路上的一個採集節點

  • 採集相應數據時會將網站的原始關係映射一併採集過來,在構建本地存儲對象時從採集鏈接中獲取原始標識,如醫院、醫生的id值

    如下代碼:
        }else if(page.getUrl().regex(DOCTOR_INFO_URL).match()){

    MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url); String doctorId = deptResultMap.getString("doctorId");

    BdDoctorRpc bdDoctorRpc = new BdDoctorRpc(); bdDoctorRpc.setSourceId(doctorId);

    String experience = page.getHtml().xpath("div[@class='doctor-experience']/div[@class='ys-util-text-smaller ys-util-margin-t10']/text()").toString(); if(StringUtils.isEmpty(experience)){ experience = page.getHtml().xpath("div[@class='doctor-experience']/div[@class='ys-util-text-smaller ys-util-margin-t10 doctor-info-total']/text()").toString(); } bdDoctorRpc.setIntro(experience); System.out.println("experience:"+experience);

  • 解析Ajax json結果

    List<String> doctorList = new JsonPathSelector("$.data.doctorList[*]").selectList(page.getRawText());

if(CollectionUtils.isNotEmpty(doctorList)){ //收集醫生信息 List<BdDoctorRpc> bdDoctorList = new ArrayList<BdDoctorRpc>(); //收集醫生與疾病關係信息 List<BdDiseaseDoctorRelaRpc> bdDiseaseDoctorRelaList = new ArrayList<BdDiseaseDoctorRelaRpc>(); for(String o:doctorList){ JSONObject doctorJo = JSON.parseObject(o);

  • 針對元素特徵一樣的元素集 如li 列表 table 表格 需要依次獲取其中的內容

 

--------------

學習視頻

 

複製鏈接,在瀏覽器打開
tomcat源碼解析
https://study.163.com/course/introduction/1209535854.htm

Springmvc源碼解析
https://study.163.com/course/introduction/1209536851.htm

dubbo源碼解析
https://study.163.com/course/introduction/1209648816.htm

 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章