Alguem me consegue explicar porque é que não me retorna nada? penas diz que foi corrido com sucesso, mas sem qualquer retorno! Não vejo nada errado.
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package googlesearchdemo;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.text.Document;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
*
* @author filip
*/
public class GoogleSearchDemo {
// pattern for extracting the link such as www.codeforeach.com/java/ ( domain
// name + path )
private static final Pattern p = Pattern
.compile("([a-zA-Z0-9]([a-zA-Z0-9\\-]{0,61}[a-zA-Z0-9])?\\.)+[a-zA-Z]{2,6}(/[^&]*)*");
public static void main(String[] args) throws IOException {
String searchQuery = "animais";
List<String> links = searchGoogle(searchQuery);
for (String link : links) {
System.out.println(link);
}
}
public static List<String> searchGoogle(String searchQuery) throws IOException {
List<String> result = new ArrayList<>();
// lets get the top results counting to nearly 15
String request = "https://www.google.com/search?q=" + searchQuery + "&num=15";
org.jsoup.nodes.Document doc = Jsoup.connect(request)
.userAgent("Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)").get();
// get the required content from response . Here ( h3 a ) is the selector
// pattern for selecting all heading links
Elements links = doc.select("h3 a[href]");
for (Element link : links) {
String hrefValue = link.attr("href");
if (hrefValue.startsWith("/url?q="))
result.add(extractLink(hrefValue));
}
return result;
}
// extract required link from href value
private static String extractLink(String href) {
String result = null;
Matcher m = p.matcher(href);
if (m.find()) {
result = m.group();
}
return result;
Debuguei e deu o seguinte erro: Not able to submit breakpoint MethodBreakpoint [teste_email.Teste_email$1].getPasswordAuthentication '()Ljava/net/PasswordAuthentication;', reason: Breakpoint belongs to disabled source root 'C:\Users\filip\Documents\NetBeansProjects\teste_email\src'. See Window/Debugging/Sources.
Olha, verifiquei o programa e somente encontrei um erro minimo na parser da pagina da google. Bom, Eu tambem coloquei um decode pra ver se o resultado estava correto quando colocasse o endereço no navegador. Funcionou corretamente.
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.text.Document;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class GoogleSearchDemo {
// pattern for extracting the link such as www.codeforeach.com/java/ ( domain
// name + path )
private static final Pattern p = Pattern
.compile("([a-zA-Z0-9]([a-zA-Z0-9\\-]{0,61}[a-zA-Z0-9])?\\.)+[a-zA-Z]{2,6}(/[^&]*)*");
public static void main(String[] args) throws IOException {
String searchQuery = "animais";
List<String> links = searchGoogle(searchQuery);
for (String link : links) {
System.out.println(link);
}
}
public static List<String> searchGoogle(String searchQuery) throws IOException {
List<String> result = new ArrayList<>();
// lets get the top results counting to nearly 15
String request = "https://www.google.com/search?q=" + searchQuery + "&num=15";
org.jsoup.nodes.Document doc = Jsoup.connect(request)
.userAgent("Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)").get();
// get the required content from response . Here ( h3 a ) is the selector
// pattern for selecting all heading links
//% System.out.println( "--> \n" + doc.toString() );
//% Elements links = doc.select("h3 a[href]");
Elements links = doc.select("a[href]");
for (Element link : links) {
String hrefValue = link.attr("href");
if (hrefValue.startsWith("/url?q=")) {
try {
hrefValue = URLDecoder.decode(hrefValue, StandardCharsets.UTF_8.toString());
result.add(extractLink(hrefValue));
} catch (UnsupportedEncodingException ex) {
throw new RuntimeException(ex.getCause());
}
}
}
return result;
}
// extract required link from href value
private static String extractLink(String href) {
String result = null;
Matcher m = p.matcher(href);
if (m.find()) {
result = m.group();
}
return result;
}
}
Agora dê uma analisada na fonte e tambem na documentação da jsoup.
Vou postar o fonte com essa alteração pra você analisar melhor e fazer as alterações que te interessam.
E tambem destaco que existe outra forma de fazer isso, que é atraves da raspagem usando WebEngine e WebView, que podem ficar ocultos se necessário e usar uma função em javascript sendo chamada atraves da WebEngine.
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javafx.util.Pair;
import javax.swing.text.Document;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class GoogleSearchDemo {
// pattern for extracting the link such as www.codeforeach.com/java/ ( domain
// name + path )
private static final Pattern p = Pattern
.compile("([a-zA-Z0-9]([a-zA-Z0-9\\-]{0,61}[a-zA-Z0-9])?\\.)+[a-zA-Z]{2,6}(/[^&]*)*");
public static void main(String[] args) throws IOException {
String searchQuery = "animais";
List<Pair> links = searchGoogle(searchQuery);
for (Pair link : links) {
System.out.println("link=" + link.getKey() + " url=" + link.getValue() );
}
}
public static List<Pair> searchGoogle(String searchQuery) throws IOException {
List<Pair> result = new ArrayList<>();
// lets get the top results counting to nearly 15
String request = "https://www.google.com/search?q=" + searchQuery + "&num=15";
org.jsoup.nodes.Document doc = Jsoup.connect(request)
.userAgent("Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)").get();
// get the required content from response . Here ( h3 a ) is the selector
// pattern for selecting all heading links
// System.out.println( "--> \n" + doc.toString() );
Elements links = doc.select(".kCrYT");
for (Element link : links) {
Elements el_a = link.select("a");
String hrefValue = el_a.attr("href");
Elements el_divs = el_a.select("div");
String nome = "";
if(el_divs.size() > 0) {
nome = el_divs.get(0).html();
}
if (hrefValue.startsWith("/url?q=")) {
try {
String slink = extractLink(hrefValue);
if( slink != null ) {
hrefValue = URLDecoder.decode(slink, StandardCharsets.UTF_8.toString());
Pair pair = new Pair(nome, hrefValue );
result.add( pair );
}
} catch (UnsupportedEncodingException ex) {
throw new RuntimeException(ex.getCause());
} catch(java.lang.IndexOutOfBoundsException ie) {
ie.printStackTrace();
// nao faca nada
}
}
}
return result;
}
// extract required link from href value
private static String extractLink(String href) {
String result = null;
Matcher m = p.matcher(href);
if (m.find()) {
result = m.group();
}
return result;
}
}