fatcodeguy
Programmer
Hi, I'm trying to parse an html file for the META tags and get the name and content attributes.
Here's the code i use
It doesn't work for META or FORM (or, I think, for any empty tags), but it does for <A href= ...> and <b>...</b>.
Any suggestions?
Here's the Metatag class
Here's the code i use
Code:
/*
Class: Test.java
Purpose:
Author:
*/
import java.io.*;
import java.lang.*;
import java.util.*;
//html parse imports
import javax.swing.text.html.*;
import javax.swing.text.*;
import javax.swing.text.EditorKit;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.SimpleAttributeSet;
public class Test{
//main function
public static void main (String args[]) throws IOException{
String filePath = "V:\\sys\\co_e.shtml";
Vector metatags = getMetatags(filePath);
System.out.println("Size: "+metatags.size());
for (int ctr=0;ctr<metatags.size();ctr++)
{
Metatag tag = (Metatag)metatags.elementAt(ctr);
System.out.println("Name: "+tag.getName());
System.out.println("Content: "+tag.getContent());
System.out.println("--------------------");
}
}//end main
public static Vector getMetatags(String filePath) {
Vector metatags = new Vector();
Metatag tag = null;
try
{
// Create a reader on the HTML content
Reader reader = new FileReader(filePath);
// Parse the HTML
EditorKit kit = new HTMLEditorKit();
HTMLDocument htmlDoc = (HTMLDocument)kit.createDefaultDocument();
kit.read(reader, htmlDoc, 0);
// Find all the META elements in the HTML document
HTMLDocument.Iterator docIterator = htmlDoc.getIterator(HTML.Tag.META);
System.out.println(docIterator.isValid());
while (docIterator.isValid())
{
System.out.println("got here");
AttributeSet sas = docIterator.getAttributes();
String name = (String)sas.getAttribute(HTML.Attribute.NAME);
String content = (String)sas.getAttribute(HTML.Attribute.CONTENT);
if (name != null && content != null)
{
tag = new Metatag(name,content);
metatags.add(tag);
}
docIterator.next();
}
}
catch (BadLocationException e) {e.printStackTrace(System.err);}
catch (IOException e) {e.printStackTrace(System.err);}
// Return all found links
return metatags;
}
}//end class
It doesn't work for META or FORM (or, I think, for any empty tags), but it does for <A href= ...> and <b>...</b>.
Any suggestions?
Here's the Metatag class
Code:
import java.io.*;
import java.lang.*;
public class Metatag{
//class variables
private String name,content;
//default constructor
public Metatag(){
name = new String();
content = new String();
}
//additional constructor
public Metatag(String name,String content){
this.name=name;
this.content=content;
}
//METHODS
//accessor methods
public String getName(){return name;}
public String getContent(){return content;}
//modifier methods
public void setName(String newName){name=newName;}
public void setContent(String newContent){content=newContent;}
//other methods
public String toString(){ return "<meta name=\""+name+"\" content=\""+content+"\">";}
}//end class