|
A simple HTML tidy program built on a Java Stack. It recognizes malformed (Error) tags, start-only tags (a tag that is opened but never closed with '>'), and unmatched tags. A complete HTML tidy would also need a dictionary of HTML tags and attributes; a better way to do this work is to use an HTML parser.
import java.util.Iterator;
import java.util.Locale;
import java.util.NoSuchElementException;
/**
 * Iterates over the HTML-like tags found in a string, in order of appearance.
 *
 * <p>A tag starts at a {@code '<'} and normally ends at the first following
 * {@code '>'}. Each tag is classified as a comment, an end tag, a
 * self-closing ("complete") tag or a start tag. A {@code '<'} that is never
 * followed by a {@code '>'} is reported as {@link Tag.Type#OnlyStart}.
 *
 * <p>NOTE(review): in the reviewed copy of this file the text between the
 * {@code COMMENTS} literal and the malformed-tag check had been stripped
 * (apparently by an HTML filter removing everything from a {@code '<'} to the
 * next {@code '>'}). The {@code COMMENTS} value, the constructor and the
 * opening of the scan (locating {@code start}, {@code end} and {@code next})
 * are reconstructed from how the surviving code uses them -- confirm against
 * the original source.
 */
public class TagIterator implements Iterator<TagIterator.Tag> {

    /**
     * One tag located in the source text. Fields are package-visible and
     * mutable because callers read them directly.
     */
    static class Tag {
        enum Type { Complete, Start, END, Comments, Error, OnlyStart }

        /** Classification of the tag. */
        Type type;
        /** Raw text of the tag, from '<' through '>' (or to end of input for OnlyStart). */
        String all;
        /** Lower-cased, trimmed tag name; empty for comments and OnlyStart. */
        String name = "";
        /** Offset of the opening '<' in the source. */
        int start;
        /** Offset of the closing '>' in the source, or -1 when there is none. */
        int end;

        @Override
        public String toString() {
            // The labels intentionally differ from the enum constant names.
            String label = null;
            if (type != null) {
                switch (type) {
                    case Complete:
                        label = "complete";
                        break;
                    case Start:
                        label = "Start";
                        break;
                    case END:
                        label = "end";
                        break;
                    case Comments:
                        label = "Comments";
                        break;
                    case Error:
                        label = "Error";
                        break;
                    case OnlyStart:
                        label = "OnlyStart";
                        break;
                    default:
                        break;
                }
            }
            return "Tag[type=" + label + ";name=" + name + ";start=" + start
                    + ";end=" + end + ";all=" + all + "]";
        }
    }

    private static final char LEFT = '<';
    private static final char RIGHT = '>';
    private static final char END = '/';
    private static final String COMMENTS = "<!--";

    private final String src;
    private final int length;
    /** Offset at which the next scan starts. */
    private int index = 0;
    /** Tag that has been scanned but not yet handed out by {@link #next()}. */
    private Tag tag;

    /**
     * Creates an iterator over the tags of {@code src}.
     *
     * @param src text to scan; {@code null} is treated as empty input
     */
    public TagIterator(String src) {
        this.src = (src == null) ? "" : src;
        this.length = this.src.length();
    }

    /**
     * Reports whether another tag is available. Calling this repeatedly
     * without calling {@link #next()} does not skip tags.
     *
     * @return {@code true} if {@link #next()} will return a tag
     */
    @Override
    public boolean hasNext() {
        if (tag == null) {
            tag = scan();
        }
        return tag != null;
    }

    /**
     * Returns the next tag.
     *
     * @return the next tag in source order
     * @throws NoSuchElementException if no tag is left
     */
    @Override
    public Tag next() {
        if (!hasNext()) {
            throw new NoSuchElementException("no more tags");
        }
        Tag result = tag;
        tag = null;
        return result;
    }

    /**
     * Not supported: the underlying string is immutable.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove");
    }

    /** Finds and classifies the next tag at or after {@code index}; returns null when none is left. */
    private Tag scan() {
        if (index >= length) {
            return null;
        }
        int start = src.indexOf(LEFT, index);
        if (start == -1) {
            index = length;
            return null;
        }
        int end = src.indexOf(RIGHT, start);
        int next = src.indexOf(LEFT, start + 1);

        Tag found = new Tag();
        found.start = start;
        found.end = end;

        if (end == -1) {
            // '<' with no closing '>' anywhere after it.
            System.err.println("error:only start:" + src.substring(start));
            found.type = Tag.Type.OnlyStart;
            found.all = src.substring(start);
            this.index = start + 1;
            return found;
        }

        if (next != -1 && end > next) {
            // A second '<' appears before the closing '>'.
            // NOTE(review): the original assigned Type.Error here, but the
            // classification below always overwrote it, so callers never saw
            // Error. That observable behaviour is kept; decide whether Error
            // should win instead.
            System.err.println("Error!" + src.substring(start, next + 1));
        }

        String tagInfo = src.substring(start, end + 1).trim();
        found.all = tagInfo;
        int last = tagInfo.length() - 1; // position of the closing '>'
        String rawName;
        if (tagInfo.startsWith(COMMENTS)) {
            found.type = Tag.Type.Comments;
            rawName = "";
        } else if (tagInfo.charAt(1) == END) {
            found.type = Tag.Type.END;
            rawName = tagInfo.substring(2, last);
        } else if (tagInfo.charAt(last - 1) == END) {
            found.type = Tag.Type.Complete;
            // The name starts right after '<' (offset 1). The previous offset
            // of 2 dropped the first letter ("<br/>" became "r").
            rawName = leadingWord(tagInfo.substring(1, last - 1));
        } else {
            found.type = Tag.Type.Start;
            rawName = leadingWord(tagInfo.substring(1, last));
        }
        // Locale.ROOT keeps the result independent of the default locale.
        found.name = rawName.toLowerCase(Locale.ROOT).trim();
        this.index = end + 1;
        return found;
    }

    /** Returns the part of {@code text} before its first whitespace character, or all of it. */
    private static String leadingWord(String text) {
        for (int i = 0; i < text.length(); i++) {
            if (Character.isWhitespace(text.charAt(i))) {
                return text.substring(0, i);
            }
        }
        return text;
    }
}
import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.Iterator; import java.util.List; import java.util.Stack; import java.util.regex.Pattern;
import base.helper.TagIterator.Tag; import base.hibernate.Content; import base.hibernate.ContentManager;
/**
 * Removes or truncates unbalanced tags in an HTML fragment, using TagIterator
 * to locate the tags.
 *
 * NOTE(review): this listing looks damaged by HTML stripping. Generic type
 * arguments are gone (the code iterates endTags as Tag elements and assigns
 * startTags.get(i) to a Tag, which raw collections do not allow), and part of
 * the scan loop in handleUnMatcher is missing. Restore both from the original
 * source before compiling.
 */
public class TidyHtml {
// Incremented by handleUnMatcher whenever more than one unmatched tag is left
// after the removable ones have been cut out. Nothing in the visible code reads it.
private int badFormat = 0;
/**
 * Cuts unmatched tags out of the given HTML text and returns the result.
 *
 * Visible steps:
 * 1. Walk the tags of src with a TagIterator. The rest of the loop body,
 *    from the check that begins with the br comparison onward, is missing
 *    from this listing, so how tags are placed into startTags, endTags and
 *    onlyStartTags cannot be seen here.
 * 2. For every tag in endTags, search startTags from the last entry to the
 *    first for a tag with the same name. The first hit is removed from
 *    startTags; an end tag with no hit goes to unMatchedTags. Whatever is
 *    left in startTags is then appended to unMatchedTags.
 * 3. Sort unMatchedTags by their start offset.
 * 4. Every unmatched tag named p, li, ul, pre or a is cut out of src and
 *    dropped from the list. Tag offsets refer to the original text, so
 *    alreadyRemovedOffset accumulates the length of the text removed so far
 *    and is subtracted from later offsets.
 * 5. If more than one unmatched tag remains, the list is printed to standard
 *    output and badFormat is incremented.
 *    NOTE(review): the threshold is greater than 1, so a single leftover tag
 *    is not counted. Confirm this is intended.
 * 6. If onlyStartTags is not empty, src is truncated at the start offset of
 *    its first entry (adjusted by alreadyRemovedOffset).
 *
 * @param src the HTML text to clean up
 * @return src with the removable unmatched tags cut out and, when a
 *         start-only tag was recorded, truncated before it
 */
public String handleUnMatcher(String src) { Stack startTags = new Stack(); Stack endTags = new Stack(); List onlyStartTags = new ArrayList(); TagIterator it = new TagIterator(src); while (it.hasNext()) { Tag tag = it.next();
if (tag.name.equals("br") || tag.name.startsWith(" unMatchedTags = new ArrayList(); if (!startTags.isEmpty() || !endTags.isEmpty()) { for (Tag t : endTags) { String endTagName = t.name; int size = startTags.size(); boolean isMatched = false; for (int i = size - 1; i >= 0; i--) { Tag startTag = startTags.get(i); if (endTagName.equals(startTag.name)) { isMatched = true; startTags.remove(i); break; } } if (!isMatched) { unMatchedTags.add(t); } } unMatchedTags.addAll(startTags); } Collections.sort(unMatchedTags, new Comparator() { public int compare(Tag t1, Tag t2) { return t1.start - t2.start; } }); Iterator iterator = unMatchedTags.iterator(); String[] ignoreTags = { "p", "li", "ul", "pre", "a" }; int alreadyRemovedOffset = 0; while (iterator.hasNext()) { Tag tag = iterator.next(); for (String ignore : ignoreTags) { if (ignore.equals(tag.name)) { src = src.substring(0, tag.start - alreadyRemovedOffset) + src.substring(tag.end + 1 - alreadyRemovedOffset); alreadyRemovedOffset += tag.all.length(); iterator.remove(); break; } } }
if (unMatchedTags.size() > 1) { System.out.println(unMatchedTags); badFormat++; } if (!onlyStartTags.isEmpty()) { Tag t = onlyStartTags.get(0); src = src.substring(0, t.start - alreadyRemovedOffset);
} return src; }
}
|