A simple html tidy
Written by Li   

A simple HTML tidy program built on a Java Stack. It recognizes malformed (Error) tags, start-only tags, and unmatched tags. A complete HTML tidy would also need a dictionary of HTML tags and attributes; a better way to do this work is to use an HTML parser.

import java.util.Iterator;

public class TagIterator implements Iterator {

/**
 * A single piece of markup found in the scanned text, together with its
 * classification and position.
 */
static class Tag {

    /** Classification assigned to a tag by the scanner. */
    enum Type {
        Complete,
        Start,
        END,
        Comments,
        Error,
        OnlyStart
    }

    /** Classification of this tag; stays {@code null} until one is assigned. */
    Type type;

    /** Raw text of the tag as captured from the source. */
    String all;

    /** Tag name; starts out as the empty string. */
    String name = "";

    /** Position where the tag begins (presumably an index into the scanned text). */
    int start;

    /** Position where the tag ends (presumably the index of its last character). */
    int end;

    /**
     * Renders every field in the form
     * {@code Tag[type=...;name=...;start=...;end=...;all=...]}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Tag[type=");
        text.append(label(type));
        text.append(";name=").append(name);
        text.append(";start=").append(start);
        text.append(";end=").append(end);
        text.append(";all=").append(all);
        text.append("]");
        return text.toString();
    }

    /** Maps a type to the label used by {@link #toString()}; {@code null} maps to {@code null}. */
    private static String label(Type kind) {
        if (kind == null) {
            return null;
        }
        switch (kind) {
            case Complete:
                return "complete";
            case Start:
                return "Start";
            case END:
                return "end";
            case Comments:
                return "Comments";
            case Error:
                return "Error";
            case OnlyStart:
                return "OnlyStart";
            default:
                return null;
        }
    }
}

/** Text from which tags are extracted. */
private String src = null;

/** Scan position; moved forward after each tag has been processed. */
private int index = 0;

/** NOTE(review): presumably the length of {@code src}; the assignment is not visible here. */
private int length = 0;

/** Tag most recently produced by the scan; this is what {@code next()} hands out. */
private Tag tag;

/** Character that opens a tag. */
private static final char LEFT = '<';

/** Character that closes a tag. */
private static final char RIGHT = '>';

/** Slash that marks a closing tag or a self-closed tag. */
private static final char END = '/';

private static String COMMENTS = " next && next != -1) {
System.err.println("Error!" + src.substring(start, next+1));
tag.type=Tag.Type.Error;

}
if (end != -1) {
String tagInfo = src.substring(start, end + 1).trim();
tag.all = tagInfo;
if (tagInfo.startsWith(COMMENTS)) {
tag.type = Tag.Type.Comments;

} else if (tagInfo.charAt(1) == END) {
tag.type = Tag.Type.END;
tag.name = tagInfo.substring(2, tagInfo.length() - 1);
} else if (tagInfo.charAt(tagInfo.length() - 2) == END) {
tag.type = Tag.Type.Complete;
int i = tagInfo.indexOf(' ');
if (i != -1) {
tag.name = tagInfo.substring(2, i);
} else {
tag.name = tagInfo.substring(2, tagInfo.length() - 2);
}
} else {
tag.type = Tag.Type.Start;
int i = tagInfo.indexOf(' ');
if (i != -1) {
tag.name = tagInfo.substring(1, i);
} else {
tag.name = tagInfo.substring(1, tagInfo.length() - 1);
}
}
this.index = end + 1;


} else {
System.err.println("error:only start:" + src.substring(start));
tag.type=Tag.Type.OnlyStart;
this.index=start+1;
}
return true;

}

/**
 * Returns the current tag after normalising its name to trimmed lower case.
 *
 * The name is lower-cased with a fixed locale so the result does not depend on
 * the JVM default locale (under a Turkish default, "I" would otherwise become
 * a dotless i and tag-name comparisons would fail).
 *
 * @return the current tag
 * @throws java.util.NoSuchElementException if no tag is available
 */
public Tag next() {
    if (tag == null) {
        // Iterator contract: signal "nothing to return" explicitly rather than
        // failing with a NullPointerException on the field access below.
        throw new java.util.NoSuchElementException("no tag available; call hasNext() first");
    }
    if (tag.name != null) {
        tag.name = tag.name.toLowerCase(java.util.Locale.ENGLISH).trim();
    }
    return tag;
}

/**
 * Removal is not supported by this iterator.
 *
 * The Iterator contract requires an unsupported {@code remove} to fail
 * loudly; silently doing nothing would let callers believe the tag was removed.
 *
 * @throws UnsupportedOperationException always
 */
public void remove() {
    throw new UnsupportedOperationException("TagIterator does not support remove()");
}

}
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Stack;
import java.util.regex.Pattern;

import base.helper.TagIterator.Tag;
import base.hibernate.Content;
import base.hibernate.ContentManager;

public class TidyHtml {

private int badFormat = 0;

/**
 * Cuts unmatched tags out of the given text and returns the result.
 *
 * NOTE(review): the statement inside the while loop that tests tag.name
 * against br is cut off in this copy of the file. The code that fills
 * startTags, endTags and onlyStartTags and the declaration of unMatchedTags
 * are missing, so this description covers only the statements that are
 * still visible.
 *
 * Visible behaviour:
 * - When either stack is non-empty, each end tag is compared by name with
 *   the start tags, scanning from the last start tag backwards. The first
 *   start tag with the same name is removed from startTags. End tags without
 *   a partner and all remaining start tags are collected in unMatchedTags.
 * - unMatchedTags is sorted by ascending start position.
 * - Unmatched tags named p, li, ul, pre or a are cut out of src and dropped
 *   from the list; alreadyRemovedOffset compensates for text already cut.
 * - If more than one unmatched tag remains, the list is printed to standard
 *   output and badFormat is incremented.
 * - If onlyStartTags is not empty, src is truncated at the start position of
 *   its first entry (adjusted by alreadyRemovedOffset).
 *
 * @param src the text to clean up
 * @return src after the cuts described above
 */
public String handleUnMatcher(String src) {
Stack startTags = new Stack();
Stack endTags = new Stack();
List onlyStartTags = new ArrayList();
TagIterator it = new TagIterator(src);
while (it.hasNext()) {
Tag tag = it.next();

if (tag.name.equals("br") || tag.name.startsWith(" unMatchedTags = new ArrayList();
if (!startTags.isEmpty() || !endTags.isEmpty()) {
for (Tag t : endTags) {
String endTagName = t.name;
int size = startTags.size();
boolean isMatched = false;
for (int i = size - 1; i >= 0; i--) {
Tag startTag = startTags.get(i);
if (endTagName.equals(startTag.name)) {
isMatched = true;
startTags.remove(i);
break;
}
}
if (!isMatched) {
unMatchedTags.add(t);
}
}
unMatchedTags.addAll(startTags);
}
Collections.sort(unMatchedTags, new Comparator() {
public int compare(Tag t1, Tag t2) {
return t1.start - t2.start;
}
});
Iterator iterator = unMatchedTags.iterator();
String[] ignoreTags = { "p", "li", "ul", "pre", "a" };
int alreadyRemovedOffset = 0;
while (iterator.hasNext()) {
Tag tag = iterator.next();
for (String ignore : ignoreTags) {
if (ignore.equals(tag.name)) {
src = src.substring(0, tag.start - alreadyRemovedOffset)
+ src.substring(tag.end + 1 - alreadyRemovedOffset);
alreadyRemovedOffset += tag.all.length();
iterator.remove();
break;
}
}
}

if (unMatchedTags.size() > 1) {
System.out.println(unMatchedTags);
badFormat++;
}
if (!onlyStartTags.isEmpty()) {
Tag t = onlyStartTags.get(0);
src = src.substring(0, t.start - alreadyRemovedOffset);

}
return src;
}

}



Tags: html tidy  
 
Recommended Reading