Exception in thread "Thread-49801" java.lang.OutOfMemoryError: Java heap space

Oi pessoal... mais uma vez venho recorrer à vossa ajuda!
O programa a seguir faz uma pesquisa em profundidade a partir de http://pt.wikipedia.org/wiki/Categoria:Medicina e, a partir dela, cria o caminho de todas as subdirectorias e respectivos artigos.
Por Exemplo:
Medicina/Especialidades médicas/Psiquiatria/Saúde mental/Psicologia/Comportamento/Ciências do comportamento/Antropologia/Comportamento/Comportamento humano/Lazer/Entretenimento/Atrações turísticas/Atrações turísticas por país/Atrações turísticas do Brasil/Áreas verdes do Brasil/Áreas verdes do estado de Rondônia/Parques nacionais em Rondônia/Parque Nacional de Pacaás Novos http://pt.wikipedia.org/wiki/Parque_Nacional_de_Paca%C3%A1s_Novos

Extraiu este caminho! O que eu precisava era de uma maneira de, quando aparecem links repetidos, não copiar esse caminho, pois já existe outro que vai dar ao mesmo sítio.
Porque senão dá-me montes de caminhos e chega a um ponto em que aparece Exception in thread "Thread-49801" java.lang.OutOfMemoryError: Java heap space!

Ajudem-me por favor...Vou passar o programa!

package Sonia;

import java.io.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class WikiDepht {
	

	FileWriter log;

	WikiDepht() throws IOException{
		log = new FileWriter("C:\debug.txt");
		loadCategories();
		continuation = new ArrayList<MyURL>();
		spider = new Spider();
	}

	ArrayList<String> categories;
	ArrayList<MyURL> continuation;

	Spider spider;

	ArrayList<MyURL> extractCategoria(MyURL base, String HTMLDoc) throws IOException{
		Pattern oPattern = Pattern.compile("<a class=\"CategoryTreeLabel  CategoryTreeLabelNs14 CategoryTreeLabelCategory\" href=\"/wiki/Categoria:[^\">]*\">");
		Pattern cPattern = Pattern.compile("</a>");

		Matcher oMatcher = oPattern.matcher(HTMLDoc);
		Matcher cMatcher;
		ArrayList<MyURL> categoria = new ArrayList<MyURL>();

		int osp, oep, csp = -1, cep = -1; // osp - opening start pos, oep - op end pos, csp - closin start pos, cep - closing end position
		StringBuffer subDoc = new StringBuffer();
		while (oMatcher.find()) {
			osp = oMatcher.start();
			oep = oMatcher.end();
			if(osp < cep)
				continue;

			cMatcher = cPattern.matcher(HTMLDoc.substring(oep));
			if(cMatcher.find()){
				csp = cMatcher.start();
				cep = cMatcher.end();
				subDoc.append(HTMLDoc.substring(osp, oep + cep) + "\n");
				//categoria.add(HTMLDoc.substring(oep, oep + csp));
			}
		}

		spider.clear();
		spider.extractText(subDoc.toString(), base);
		categoria.addAll(spider.getURLs());
		Collections.sort(categoria, new MyURL.CompByURLID());
		return categoria;
	}

	String extractArticles(String HTMLDoc){

		//<h2>Artigos na categoria "Medicina"</h2>
		String ptrn = "<h2>Artigos na categoria ";
		int beg = HTMLDoc.indexOf(ptrn);
		if(beg < 0)
			return null;
		String subDoc = HTMLDoc.substring(beg);

		Pattern oPattern = Pattern.compile("<table width=\"100%\">");
		Pattern cPattern = Pattern.compile("</table>");

		Matcher oMatcher = oPattern.matcher(subDoc);
		Matcher cMatcher;
		String artigos = "";

		int osp, oep, csp = -1, cep = -1; // osp - opening start pos, oep - op end pos, csp - closin start pos, cep - closing end position
		while (oMatcher.find()) {
			osp = oMatcher.start();
			oep = oMatcher.end();
			if(osp < cep)
				continue;

			cMatcher = cPattern.matcher(subDoc.substring(oep));
			if(cMatcher.find()){
				csp = cMatcher.start();
				cep = cMatcher.end();
				artigos += subDoc.substring(oep, oep + csp);

			}
		}
		return artigos;
	}

	ArrayList<MyURL> subCategories = new ArrayList<MyURL>();
	ArrayList<MyURL> articles = new ArrayList<MyURL>();
	@SuppressWarnings({ "unchecked", "unchecked" })
	void processCategory() throws IOException{
		MyURL base = continuation.get(0);

		ArrayList<MyURL> categoria = new ArrayList<MyURL>();

		for(int i = 0; i < continuation.size(); i++){
			base = continuation.get(i);

			spider.clear();
			Document doc =  spider.begin(base);

			categoria.addAll(extractCategoria(base, doc.HTML));


			String tbl = extractArticles(doc.HTML);

			if(tbl != null){
				spider.clear();
				spider.extractText(tbl, base);
				ArrayList<MyURL> URLs = spider.getURLs();
				Collections.sort(URLs, new MyURL.CompByURLID());

				articles.addAll(URLs);

			}

			for(int j = 0; j < doc.links.size(); j++){
				MyURL t = (MyURL) doc.links.get(j);
				if(t.LinkText.matches("pr.ximos \d+")){
					continuation.add(t);
					break;
				}
			}
		}
		subCategories.addAll(categoria);
	}

	void iterateCategories() throws IOException{
		for (String cat : categories) {
			String folder = "http://pt.wikipedia.org/wiki/" + cat;  
			//String folder = "http://pt.wikipedia.org/wiki/" + word;

			MyURL base = new MyURL(folder, 0);
			continuation.clear();
			continuation.add(base);

			subCategories.clear();
			articles.clear();
			continuation.add(base);
			processCategory();
		}
	}

	void loadCategories() throws IOException{
		categories = new ArrayList<String>();

		FileReader fr= new FileReader(Folders.homeFolder +"Links de Categorias da Medicina.txt");
		BufferedReader br= new BufferedReader(fr);

		for (;;) {
			String line= br.readLine();
			if ( line == null ) break;
			categories.add(line);
		}
		br.close();
	}

	ArrayList<MyURL> paths = new ArrayList<MyURL>();
	void WikiInDepht(String path, MyURL base) throws IOException{
		int idx = Collections.binarySearch(paths, base, MyURL.compByURLStr);
		if(idx < 0)
			paths.add( -1 - idx, base);
		else
			return;
		base.LinkText = path;
		base.ID = paths.size();
		System.out.println(base.LinkText + "\t" + base.aURL);
		log.write(base.LinkText + "\t" + base.aURL + "\n");
		log.flush();

		subCategories.clear();
		articles.clear();
		continuation.clear();
		continuation.add(base);
		processCategory();

		for(MyURL a : articles){
			idx = Collections.binarySearch(paths, a, MyURL.compByURLStr);
			if(idx < 0)
				paths.add( -1 - idx, base);
			a.LinkText =  path + "/" + a.LinkText;
			a.ID = paths.size();
		}

		ArrayList<MyURL> localCategories = new ArrayList<MyURL>();
		localCategories.addAll(subCategories);

		for(MyURL sc: localCategories)
			WikiInDepht(path + "/" + sc.LinkText, sc);

	}

	public static void main(String[] args) throws IOException {
		WikiDepht sc = new WikiDepht();
		MyURL b = new MyURL("http://pt.wikipedia.org/wiki/Categoria:Medicina", 0);
		sc.WikiInDepht("Medicina", b);
		Collections.sort(sc.paths, MyURL.compByURLID);
		StringBuffer res = new StringBuffer();
		for(MyURL p: sc.paths)
			res.append(p + "\n");
		IO.save(res.toString(), "C:\Documents and Settings\Sonia\My Documents\Projecto\Importante\Caminho\patsh.txt");

		System.out.println("fim do programa.");
	}

}

Como vou conseguir todas as subdirectorias e artigos a partir dali?!!

Beijo grande

Tem razão victor...isso não é fácil!!!
Vou-te mostrar essas classes, para ver se vc me pode ajudar...

Class Document

package Sonia;

import java.util.ArrayList;

/**
 * Mutable holder for the data of one page. In this file it is obtained from
 * {@code Spider.begin(...)} and read by {@code WikiDepht}.
 */
public class Document {

	/** Raw HTML of the page; starts out empty. */
	String HTML = "";

	/** Text of the page (presumably the HTML with markup stripped; confirm against Spider); starts out empty. */
	String text = "";

	/** Links of the page; {@code WikiDepht} casts the elements to {@code MyURL}. */
	ArrayList links = new ArrayList();

	/** URL associated with the page (presumably the one it was loaded from; confirm against Spider); unset by default. */
	MyURL base;
}

Class MyURL

package Sonia;

import java.io.Serializable;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Comparator;

public class MyURL implements Serializable {
	
	private static final long serialVersionUID = 1L;

	public MyURL(String spec, int NewDepth) throws MalformedURLException{
		aURL = new URL(spec);
		LinkText = "";
		depth = NewDepth;
	}
	
	public MyURL(URL context, String spec, int NewDepth) throws MalformedURLException{
		aURL = new URL(context, spec);
		LinkText = "";
		depth = NewDepth;
	}
	
	public URL aURL;
	public int depth;
	public int ID = 0;
	public int Parent_ID = 0;
	public String LinkText = "";
	public String Title = ""; 
	public double weigh = 0;
	
	public boolean equals(Object obj){
		if(!(obj instanceof MyURL)) return super.equals(obj);
		return aURL.equals(((MyURL)obj).aURL);
	}
	
	public static class CompByURLStr implements Comparator{

		public int compare(Object o1, Object o2) {
			return ((MyURL) o1).aURL.toString().compareTo(((MyURL) o2).aURL.toString());
		}
	}
	
	public static CompByURLStr compByURLStr = new CompByURLStr();
	
	public static class CompByURLID implements Comparator{

		public int compare(Object o1, Object o2) {
			if(((MyURL) o1).ID - ((MyURL) o2).ID < 0)
				return -1;
			if(((MyURL) o1).ID - ((MyURL) o2).ID > 0)
				return +1;
			return 0;
		}
	}
	
	public static CompByURLID compByURLID = new CompByURLID();
	
}

Consegue entender assim e me ajudar?

7 Respostas

Topicos relacionados