From 7d723d8cc81e7e133cc546ace8b472d9b8f3b467 Mon Sep 17 00:00:00 2001 From: "six.nonacosa" Date: Tue, 22 May 2018 00:33:14 +0800 Subject: [PATCH] =?UTF-8?q?=E5=BC=80=E5=A7=8B=E9=87=8D=E6=96=B0=E5=90=AF?= =?UTF-8?q?=E5=8A=A8webBee=EF=BC=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 开始重新启动webBee! --- .../test/java/example/BaiduSearchByUrl.java | 51 +++++++++++++++++++ .../src/test/java/example/MainDemoByApi.java | 4 +- .../src/test/java/example/MainDemoByUrl.java | 2 +- 3 files changed, 54 insertions(+), 3 deletions(-) create mode 100755 webBee-core/src/test/java/example/BaiduSearchByUrl.java diff --git a/webBee-core/src/test/java/example/BaiduSearchByUrl.java b/webBee-core/src/test/java/example/BaiduSearchByUrl.java new file mode 100755 index 0000000..bb30a73 --- /dev/null +++ b/webBee-core/src/test/java/example/BaiduSearchByUrl.java @@ -0,0 +1,51 @@ +package example; /** + * Created by zhuang on 2017/3/23. + */ + + + +import org.bee.webBee.Bee; +import org.bee.webBee.linker.Page; +import org.bee.webBee.processor.PageProcessor; +import org.bee.webBee.processor.Setting; + + +/** + * 类似servlet 实现HttpServlet doGet doPost 方法的方式定义爬虫 + * data 2017-03-23 01:19 + * E-mail sis.nonacosa@gmail.com + * @author sis.nonacosa + */ + +public class BaiduSearchByUrl implements PageProcessor { + + private Setting setting; + + @Override + public void process(Page page) { + //todo page.getJson/html/string().$('textarea.content').as('content').bulid().$('#img').as('img') + //todo 期望结果: {content:[],img:[]} 一条{}多条[] 的json格式 + //todo page.nextUrl('span>ss>s') + //todo 直接获取api接口 + String json = page.getHtml().$("body").toJSONString(); +// + System.out.println(json); + } + + @Override + public Setting getSetting() { + System.out.println("This is example.MainDemoByUrl's setting function ..."); + setting = Setting.create().setStartUrl("https://www.baidu.com/s?wd=666666"); + //添加cookie,模拟登陆 也可以选择setting.addCookie(key,value)添加cookie; +// setting = setting.addHeader("Cookie","d_c0=\"AHBCTk4QowuPTs0xoWv4_K0tdVn73ZvN2EI=|1492701580\"; _zap=9632bb9d-c70d-40c1-9f1b-3bd23a1116ca; q_c1=6061d5105e7144e9986c696caa21bb08|1500789237000|1492701579000; q_c1=6061d5105e7144e9986c696caa21bb08|1500789237000|1492701579000; aliyungf_tc=AQAAAKuBthrGBQkAOiAvcSpAeDQcAU+4; r_cap_id=\"ZGE2ZDIzYWRhMDA0NDM0MDgxNzI4M2ZmN2U2ODc0ZDk=|1503375703|f27506bb96ac61f5dc4eee2e6b221be5711d82db\"; cap_id=\"MmVkMzczNzg4ZjE1NDYxMTljNTg2ODk4YjliMDdiMWY=|1503375703|6f093dae6733f3d24a38941b76b0c604d1757318\"; __utma=51854390.728883040.1503302344.1503302344.1503375703.2; __utmb=51854390.0.10.1503375703; __utmc=51854390; __utmz=51854390.1503375703.2.2.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=51854390.000--|2=registration_date=20160504=1^3=entry_date=20170420=1; l_cap_id=\"MDRhYmNjOTI0Zjg3NDRlZjk1OWJhNDg0YWE5ZWFmZmQ=|1503375748|35ca7670e26e9aab906ea34b3f790164397aa543\"; z_c0=Mi4xWVBMREJRQUFBQUFBY0VKT1RoQ2pDeGNBQUFCaEFsVk5tajdEV1FEYnVFYVBWRmdrSFNMLURKV25LblhRQ0d2TEhn|1503375770|eabf91031457017a63bacaf49356d85485986971; unlock_ticket=\"QUpCQ19Cc2ZRZ3dYQUFBQVlRSlZUYUs0bTFrYjdxbndjamtadFJrMWFyVFFxNHRwWFctTThRPT0=|1503375770|bdb7d52666c0238036a4a7e2536c46528e294dd1\"; _xsrf=3c0570c7-fc62-47bd-a2fb-613eb2457e31"); + setting = setting.setDomain("baidu.com"); + setting = setting.setHttpMethod("GET"); + return setting; + } + + + + public static void main(String[] args) { + Bee.create(new BaiduSearchByUrl()).run(); + } +} diff --git a/webBee-core/src/test/java/example/MainDemoByApi.java b/webBee-core/src/test/java/example/MainDemoByApi.java index 8e81ab0..efb86c9 100755 --- a/webBee-core/src/test/java/example/MainDemoByApi.java +++ b/webBee-core/src/test/java/example/MainDemoByApi.java @@ -48,10 +48,10 @@ public Setting getSetting() { setting = setting.addHeader("Accept-Language","zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4"); setting = setting.addHeader("Cache-Control","max-age=0"); setting = setting.addHeader("Connection","keep-alive"); - setting = setting.addHeader("Host","www.zhihu.com"); + setting = setting.addHeader("Host","zhihu.com"); setting = setting.addHeader("Upgrade-Insecure-Requests","1"); setting = setting.addHeader("User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"); - setting = setting.addHeader("Cookie","d_c0=\"AHBCTk4QowuPTs0xoWv4_K0tdVn73ZvN2EI=|1492701580\"; _zap=9632bb9d-c70d-40c1-9f1b-3bd23a1116ca; q_c1=6061d5105e7144e9986c696caa21bb08|1500789237000|1492701579000; q_c1=6061d5105e7144e9986c696caa21bb08|1500789237000|1492701579000; aliyungf_tc=AQAAAKuBthrGBQkAOiAvcSpAeDQcAU+4; r_cap_id=\"ZGE2ZDIzYWRhMDA0NDM0MDgxNzI4M2ZmN2U2ODc0ZDk=|1503375703|f27506bb96ac61f5dc4eee2e6b221be5711d82db\"; cap_id=\"MmVkMzczNzg4ZjE1NDYxMTljNTg2ODk4YjliMDdiMWY=|1503375703|6f093dae6733f3d24a38941b76b0c604d1757318\"; __utma=51854390.728883040.1503302344.1503302344.1503375703.2; __utmb=51854390.0.10.1503375703; __utmc=51854390; __utmz=51854390.1503375703.2.2.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=51854390.000--|2=registration_date=20160504=1^3=entry_date=20170420=1; l_cap_id=\"MDRhYmNjOTI0Zjg3NDRlZjk1OWJhNDg0YWE5ZWFmZmQ=|1503375748|35ca7670e26e9aab906ea34b3f790164397aa543\"; z_c0=Mi4xWVBMREJRQUFBQUFBY0VKT1RoQ2pDeGNBQUFCaEFsVk5tajdEV1FEYnVFYVBWRmdrSFNMLURKV25LblhRQ0d2TEhn|1503375770|eabf91031457017a63bacaf49356d85485986971; unlock_ticket=\"QUpCQ19Cc2ZRZ3dYQUFBQVlRSlZUYUs0bTFrYjdxbndjamtadFJrMWFyVFFxNHRwWFctTThRPT0=|1503375770|bdb7d52666c0238036a4a7e2536c46528e294dd1\"; _xsrf=3c0570c7-fc62-47bd-a2fb-613eb2457e31"); + setting = setting.addHeader("Cookie","d_c0=\"AHBCTk4QowuPTs0xoWv4_K0tdVn73ZvN2EI=|1492701580\"; _zap=9632bb9d-c70d-40c1-9f1b-3bd23a1116ca; q_c1=6061d5105e7144e9986c696caa21bb08|1506316115000|1492701579000; __DAYU_PP=mvU2ZM7RnfJeqjIYBJna2911b246d4a8; __utma=155987696.350837047.1524803761.1524803761.1524803761.1; __utmz=155987696.1524803761.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; q_c1=6061d5105e7144e9986c696caa21bb08|1525321980000|1492701579000; _xsrf=5348aed3-5d4e-404b-812e-88552e539882; tgw_l7_route=56f3b730f2eb8b75242a8095a22206f8; capsion_ticket=\"2|1:0|10:1526919135|14:capsion_ticket|44:MTM3NWY5MjFkM2YwNDAzNjljMDgxODM1ZWZhMGRlYTY=|42990fb74a71d0bbb63bb01972fcf014dca09c8efaad0eb163d024817ab21a73\"; z_c0=\"2|1:0|10:1526919283|4:z_c0|92:Mi4xUC1DVUNRQUFBQUFBY0VKT1RoQ2pDeVlBQUFCZ0FsVk5jejd3V3dEU0IzVVNJWWN4LWstb2ltUjhOZ0lmdURSVkNB|036748deda83a8431bb417b446b24dbe37a41f2bec6cab489397dcbe04fc354e\""); setting = setting.setHttpMethod("GET"); //你需要解析的json数据格式 data->paging->next setting = setting.setNextUrlKeyOnResult("paging->next"); diff --git a/webBee-core/src/test/java/example/MainDemoByUrl.java b/webBee-core/src/test/java/example/MainDemoByUrl.java index 6dbfac4..e61a287 100755 --- a/webBee-core/src/test/java/example/MainDemoByUrl.java +++ b/webBee-core/src/test/java/example/MainDemoByUrl.java @@ -37,7 +37,7 @@ public Setting getSetting() { System.out.println("This is example.MainDemoByUrl's setting function ..."); setting = Setting.create().setStartUrl("http://www.ZhiHu.com/explore"); //添加cookie,模拟登陆 也可以选择setting.addCookie(key,value)添加cookie; - setting = setting.addHeader("Cookie","d_c0=\"AHBCTk4QowuPTs0xoWv4_K0tdVn73ZvN2EI=|1492701580\"; _zap=9632bb9d-c70d-40c1-9f1b-3bd23a1116ca; q_c1=6061d5105e7144e9986c696caa21bb08|1500789237000|1492701579000; q_c1=6061d5105e7144e9986c696caa21bb08|1500789237000|1492701579000; aliyungf_tc=AQAAAKuBthrGBQkAOiAvcSpAeDQcAU+4; r_cap_id=\"ZGE2ZDIzYWRhMDA0NDM0MDgxNzI4M2ZmN2U2ODc0ZDk=|1503375703|f27506bb96ac61f5dc4eee2e6b221be5711d82db\"; cap_id=\"MmVkMzczNzg4ZjE1NDYxMTljNTg2ODk4YjliMDdiMWY=|1503375703|6f093dae6733f3d24a38941b76b0c604d1757318\"; __utma=51854390.728883040.1503302344.1503302344.1503375703.2; __utmb=51854390.0.10.1503375703; __utmc=51854390; __utmz=51854390.1503375703.2.2.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=51854390.000--|2=registration_date=20160504=1^3=entry_date=20170420=1; l_cap_id=\"MDRhYmNjOTI0Zjg3NDRlZjk1OWJhNDg0YWE5ZWFmZmQ=|1503375748|35ca7670e26e9aab906ea34b3f790164397aa543\"; z_c0=Mi4xWVBMREJRQUFBQUFBY0VKT1RoQ2pDeGNBQUFCaEFsVk5tajdEV1FEYnVFYVBWRmdrSFNMLURKV25LblhRQ0d2TEhn|1503375770|eabf91031457017a63bacaf49356d85485986971; unlock_ticket=\"QUpCQ19Cc2ZRZ3dYQUFBQVlRSlZUYUs0bTFrYjdxbndjamtadFJrMWFyVFFxNHRwWFctTThRPT0=|1503375770|bdb7d52666c0238036a4a7e2536c46528e294dd1\"; _xsrf=3c0570c7-fc62-47bd-a2fb-613eb2457e31"); + setting = setting.addHeader("Cookie","d_c0=\"AHBCTk4QowuPTs0xoWv4_K0tdVn73ZvN2EI=|1492701580\"; _zap=9632bb9d-c70d-40c1-9f1b-3bd23a1116ca; q_c1=6061d5105e7144e9986c696caa21bb08|1506316115000|1492701579000; __DAYU_PP=mvU2ZM7RnfJeqjIYBJna2911b246d4a8; __utma=155987696.350837047.1524803761.1524803761.1524803761.1; __utmz=155987696.1524803761.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; q_c1=6061d5105e7144e9986c696caa21bb08|1525321980000|1492701579000; _xsrf=5348aed3-5d4e-404b-812e-88552e539882; tgw_l7_route=56f3b730f2eb8b75242a8095a22206f8; capsion_ticket=\"2|1:0|10:1526919135|14:capsion_ticket|44:MTM3NWY5MjFkM2YwNDAzNjljMDgxODM1ZWZhMGRlYTY=|42990fb74a71d0bbb63bb01972fcf014dca09c8efaad0eb163d024817ab21a73\"; z_c0=\"2|1:0|10:1526919283|4:z_c0|92:Mi4xUC1DVUNRQUFBQUFBY0VKT1RoQ2pDeVlBQUFCZ0FsVk5jejd3V3dEU0IzVVNJWWN4LWstb2ltUjhOZ0lmdURSVkNB|036748deda83a8431bb417b446b24dbe37a41f2bec6cab489397dcbe04fc354e\""); setting = setting.setDomain("zhihu.com"); setting = setting.setHttpMethod("GET"); return setting;