HttpClient使用
HttpClient应用
用HttpClient爬网站时有时会遇到一种现象,就是自己写个JSP模拟表单提交可以成功,但是用HttpClient就无法成功。
原因有可能是Header或者Cookies没有设置:如果用浏览器访问,这些参数都会由浏览器自动带上,不需要自己手动设置,而HttpClient不会自动添加。所以我们需要对HttpClient进行设置,完整的代码如下:
public static String HttpClientCIB() { //添加header信息 List <Header> headers = new ArrayList <Header>(); headers.add(new Header("Referer", "http://wap.sududa.com/default.aspx")); headers.add(new Header("User-Agent", "http://wap.sududa.com/default.aspx")); headers.add(new Header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")); headers.add(new Header("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)")); headers.add(new Header("Accept-Language", "zh-cn,zh;q=0.5")); headers.add(new Header("Host", "wap.sududa.com")); headers.add(new Header("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7")); HttpClient httpclient = new HttpClient(); httpclient.getHostConfiguration().getParams().setParameter("http.default-headers", headers); httpclient.getHostConfiguration().setProxy( "202.84.17.41",8080); httpclient.getHttpConnectionManager().getParams().setConnectionTimeout(50); String result=""; PostMethod httppost = new PostMethod("http://wap.sududa.com/wap/default.aspx"); httppost.getParams().setContentCharset("GB2312"); httppost.setRequestHeader("Connection", "close"); //添加两个Cookie信息 httpclient.getParams().setCookiePolicy(CookiePolicy.RFC_2109);//RFC_2109是支持较普遍的一个,还有其他cookie协议 HttpState initialState = new HttpState(); Cookie SUDUDA_COM_WapBalance = new Cookie(); SUDUDA_COM_WapBalance.setDomain("wap.sududa.com"); SUDUDA_COM_WapBalance.setPath("/"); SUDUDA_COM_WapBalance.setName("SUDUDA_COM_WapBalance"); SUDUDA_COM_WapBalance.setValue("0.000"); Cookie SUDUDA_COM_WapKey = new Cookie(); SUDUDA_COM_WapKey.setDomain("wap.sududa.com"); SUDUDA_COM_WapKey.setPath("/"); SUDUDA_COM_WapKey.setName("SUDUDA_COM_WapKey"); SUDUDA_COM_WapKey.setValue("56643CAF26F5A7751F1097F4B3D01AC4"); initialState.addCookie(SUDUDA_COM_WapBalance); initialState.addCookie(SUDUDA_COM_WapKey); httpclient.setState(initialState); NameValuePair[] data = { new NameValuePair("__VIEWSTATE", "/wEPDwUKMjAzOTY2ODc0NmRkkLuRhKYz6SYsEOIBQM8bHuiWt2k="), new NameValuePair(".logontest", ""), 
new NameValuePair("Name", "readls@163.com"), new NameValuePair("Pass","changwei"), }; httppost.setRequestBody(data); InputStream is = null; try { httpclient.executeMethod(httppost); BufferedReader bf = new BufferedReader(new InputStreamReader(httppost.getResponseBodyAsStream())); // result=httppost.getResponseBodyAsStream(); String line; StringBuffer paramter= new StringBuffer(); while ((line = bf.readLine()) != null){ // System.out.println(line); paramter=paramter.append(line); } bf.close(); result=paramter.toString(); result=result.replaceAll("\n",""); result=result.replaceAll("\r",""); result=result.replaceAll("\t",""); // System.out.println("||"+result+"||"); }catch( Exception e){ e.printStackTrace(); } finally { httppost.releaseConnection(); } return result; } 想获得该网页的Header和Cookie信息,可以通过HttpWatch工具。