使用正则进行客户端向service捕捉html数据

一般是使用json与服务器端交互的,但是如果service无法提供json时,我们通常使用html的解析api或者直接使用正则匹配

我们要使用java实现上面的搜索引擎

创建Bean,装Book的图片链接,名字,二级url

package com.org;

/**
 * Simple mutable bean describing one book found in a search result page.
 *
 * It carries three strings: the link to the book's detail page, the book's
 * title, and the source path of its cover image. All fields start as
 * {@code null} until set.
 */
public class Book {
	/** Link to the book's detail page. */
	private String href;
	/** Title of the book. */
	private String title;
	/** Source path of the cover image. */
	private String src;

	/** @return the link to the book's detail page, or {@code null} if unset */
	public String getHref() {
		return this.href;
	}

	/** @param href the link to the book's detail page */
	public void setHref(String href) {
		this.href = href;
	}

	/** @return the book title, or {@code null} if unset */
	public String getTitle() {
		return this.title;
	}

	/** @param title the book title */
	public void setTitle(String title) {
		this.title = title;
	}

	/** @return the cover image source path, or {@code null} if unset */
	public String getSrc() {
		return this.src;
	}

	/** @param src the cover image source path */
	public void setSrc(String src) {
		this.src = src;
	}

	/**
	 * Renders all three fields in the form
	 * {@code Book [href=...,title=...,src=...]}; unset fields print as "null".
	 */
	@Override
	public String toString() {
		StringBuilder text = new StringBuilder("Book [href=");
		text.append(this.href);
		text.append(",title=").append(this.title);
		text.append(",src=").append(this.src);
		text.append("]");
		return text.toString();
	}

}

http获取整个html

package com.org;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;

/**
 * Small HTTP helper that downloads a page as text.
 */
public class UtilNet {

	/**
	 * Fetches the resource at {@code url} and returns its body decoded as UTF-8.
	 *
	 * When {@code body} is {@code null} or empty the request is sent as a plain
	 * GET. When {@code body} has content it is written as the request body
	 * (UTF-8 encoded) and the request is sent as a POST.
	 *
	 * Every line of the response is appended to the result followed by a
	 * single {@code '\n'}.
	 *
	 * @param url  absolute HTTP URL to fetch
	 * @param body optional request body; {@code null} or empty means "no body"
	 * @return the response text, or {@code null} if the URL is malformed or an
	 *         I/O error occurs (the exception is printed to standard error)
	 */
	public static StringBuffer getContent(String url,String body){
		HttpURLConnection conn = null;
		BufferedReader bufferIn = null;
		try {
			URL http = new URL(url);
			conn = (HttpURLConnection) http.openConnection();

			// Only open the output side when there is something to send:
			// the JDK's HttpURLConnection turns a GET into a POST as soon as
			// the output stream is requested, so an unconditional
			// setDoOutput(true) would never perform a real GET.
			boolean hasBody = body != null && body.length() > 0;
			conn.setRequestMethod(hasBody ? "POST" : "GET");

			// NOTE(review): Host is hard-coded to it-ebooks.info regardless of
			// the url argument — confirm this helper is only meant for that site.
			conn.setRequestProperty("Host","it-ebooks.info");
			conn.setRequestProperty("Connection","keep-alive");
			conn.setRequestProperty("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
			conn.setRequestProperty("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/35.0.1916.153 Safari/537.36 SE 2.X MetaSr 1.0");
			conn.setRequestProperty("Referer",url);
			// Accept-Encoding (gzip,deflate,sdch) is deliberately not sent: the
			// response is read below as plain text without decompression, so a
			// compressed body would come back garbled.
			conn.setRequestProperty("Accept-Language","zh-CN,zh;q=0.8");

			conn.setDoInput(true);
			conn.setDoOutput(hasBody);
			conn.connect();

			if (hasBody) {
				// Send the request payload, using the same charset the
				// response is decoded with.
				OutputStream out = conn.getOutputStream();
				try {
					out.write(body.getBytes("UTF-8"));
					out.flush();
				} finally {
					out.close();
				}
			}

			bufferIn = new BufferedReader(new InputStreamReader(conn.getInputStream(),"UTF-8"));

			// Read the response line by line, normalising line ends to '\n'.
			StringBuffer data = new StringBuffer();
			String line;
			while((line = bufferIn.readLine())!=null){
				data.append(line).append('\n');
			}
			return data;
		} catch (IOException e) {
			// MalformedURLException is an IOException, so this also covers a bad url.
			e.printStackTrace();
			return null;
		} finally {
			// Release the stream and the connection on every path, including failures.
			closeQuietly(bufferIn);
			if (conn != null) {
				conn.disconnect();
			}
		}
	}

	/** Closes {@code resource} if it is non-null, ignoring any failure to close. */
	private static void closeQuietly(java.io.Closeable resource){
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException ignored) {
			// Nothing useful can be done if closing fails during cleanup.
		}
	}

}

正则解析html,获取需要的数据

package com.org;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regex-based extractor that turns fragments of an HTML page into
 * {@link Book} objects.
 */
public class UtilJson {
	/** Book detail path, e.g. {@code /book/123}. */
	private static final Pattern PATTERN_HREF = Pattern.compile("/book/[0-9]*",Pattern.CASE_INSENSITIVE);
	/** First single-quoted value in the fragment; group 1 is the text without the quotes. */
	private static final Pattern PATTERN_TITLE = Pattern.compile("'([^']*)'");
	/**
	 * Image path ending in ".jpg". The dot is escaped and the path may not
	 * contain quotes, whitespace or '>', so the match cannot run past the
	 * attribute value into later text on the same line.
	 */
	private static final Pattern PATTERN_SRC = Pattern.compile("/images/[^\"'\\s>]*\\.jpg",Pattern.CASE_INSENSITIVE);

	/**
	 * Finds every fragment of {@code ct} that matches {@code reg}
	 * (case-insensitively) and extracts a {@link Book} from each one.
	 *
	 * From a fragment the method takes the first {@code /book/<digits>} path
	 * as the href, the first single-quoted value as the title, and the first
	 * {@code /images/....jpg} path as the image source. Fragments in which any
	 * of the three is missing are skipped, so the result never contains a
	 * book with unset fields.
	 *
	 * @param ct  the HTML text to scan; {@code null} yields an empty list
	 * @param reg regular expression selecting the fragments that describe one book each
	 * @return the extracted books in document order; never {@code null}
	 * @throws java.util.regex.PatternSyntaxException if {@code reg} is not a valid pattern
	 */
	public static List<Book> paraseHtml(String ct,String reg){
		List<Book> books = new ArrayList<Book>();
		if (ct == null) {
			return books;
		}

		Matcher matcherLine = Pattern.compile(reg,Pattern.CASE_INSENSITIVE).matcher(ct);
		while(matcherLine.find()){
			String line = matcherLine.group(0);

			Matcher matcherHref = PATTERN_HREF.matcher(line);
			Matcher matcherTitle = PATTERN_TITLE.matcher(line);
			Matcher matcherSrc = PATTERN_SRC.matcher(line);

			// Only a fragment that yields all three parts is a usable book entry.
			if(!(matcherHref.find()&&matcherTitle.find()&&matcherSrc.find())){
				continue;
			}

			Book bk = new Book();
			bk.setHref(matcherHref.group(0));
			bk.setTitle(matcherTitle.group(1));
			bk.setSrc(matcherSrc.group(0));
			books.add(bk);
		}
		return books;
	}
}

package com.org;

import java.io.UnsupportedEncodingException;
import java.util.List;

import org.junit.Test;

/**
 * Manual end-to-end check: downloads a search result page and prints the
 * books extracted from it.
 */
public class NetTest {
	/**
	 * Fetches the it-ebooks.info title search for "java", extracts every
	 * book anchor with a regular expression and prints each resulting
	 * {@link Book} to standard output. Requires network access.
	 */
	@Test
	public void getHtml() throws UnsupportedEncodingException{
		final String searchUrl = "http://it-ebooks.info/search/?q=java&type=title";
		// One <a href="/book/N/" title='...'><img ...></a> anchor per book.
		final String anchorRegex = "<a\\shref=\"/book/[0-9]*/\"\\stitle='.*><img\\s.*</a>";

		String html = UtilNet.getContent(searchUrl,"").toString();
		List<Book> found = UtilJson.paraseHtml(html,anchorRegex);

		for(int i = 0; i < found.size(); i++){
			System.out.println(found.get(i).toString());
		}
	}
}

效果

点击打开链接

使用正则进行客户端向service捕捉html数据

相关文章