march alex's blog
Hello, I am march alex
posts - 52, comments - 7, trackbacks - 0
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;

public class GoogleFinderNew {
    
    
private static String address = "https://www.google.com.hk/search?hl=en&q=";
    
private static String query = "";
    
private static String charset = "UTF-8";
    
    
private static List<String> useragentList = new ArrayList<String>();
    
    
private static void initUserAgentList(String filename) throws IOException {
        BufferedReader reader 
= new BufferedReader(new FileReader(filename));  
        String line 
= null;
        
while((line = reader.readLine()) != null){
            useragentList.add(line.trim());
        }
        reader.close();
        
return;
    }
    
    
private static List<String> getpages(URL url) throws IOException {
        HttpURLConnection urlConnection 
= (HttpURLConnection) url.openConnection();
        BufferedReader reader 
= new BufferedReader(new InputStreamReader(urlConnection.getInputStream()));
        String line;
        String ans 
= "";
        
while ((line = reader.readLine()) != null){
             ans 
+= line + "\n";
        }
        
int st = -1, ed = 0;
        List
<String> pagesList = new ArrayList<String>();
        
        
while((st = ans.indexOf("<h3 class=\"r\"><a href=\"", ed)) != -1) {
            ed = ans.indexOf("\"", st+23);
            //System.out.println(ans.substring(st+23, ed));
            pagesList.add(ans.substring(st+23, ed));
        }
        
return pagesList;
    }
    
    
public static void main(String[] args) throws MalformedURLException, IOException, InterruptedException {
        
        Scanner in 
= new Scanner(System.in);
        String askurl 
= in.nextLine();
        query 
= in.nextLine();
        
        initUserAgentList(
"D:\\useragent.txt");
        
        
//System.setProperty("http.agent", "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.3.154.9 Safari/525.19");
        
        
int pages = 10;
        
        
for(int i=0;i<pages;i++) {
            System.out.println((i
+1+ " ..");
            
            
int index = (int)(useragentList.size()*Math.random());
            
if(index == useragentList.size()) index --;
            String theUserAgent 
= useragentList.get(index);
            
            System.setProperty(
"http.agent", theUserAgent);
            
            String urlString 
= address + URLEncoder.encode(query, charset);
            
if(i != 0) urlString += "&start=" + i + "0";
            
//System.out.println(urlString);
            List<String> list = getpages(new URL(urlString));
            
for(String page : list) {
                
if(page.contains(askurl)) {
//                if(page.equals(askurl)) {
                    System.out.println(askurl + " found in the " + (i+1+ " th page.");
                    System.out.println(page);
                    
return;
                }
            }
            
int extraTime = 0//(int)(3000 * Math.random());
            Thread.sleep(1000 + extraTime);
        }
        
        System.out.println(
"can't find " + askurl + " in the first " + pages + " pages.");
    }
}
posted on 2015-07-29 16:41 marchalex 阅读(262) 评论(0)  编辑  收藏 所属分类: java小程序

只有注册用户登录后才能发表评论。


网站导航: