/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.process;

import edu.stanford.nlp.ling.BasicDocument;
import edu.stanford.nlp.ling.Document;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.AbstractListProcessor;
import edu.stanford.nlp.process.WordToSentenceProcessor;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
/**
 * A list processor that removes markup-tag tokens from a list of {@link Word}s.
 *
 * <p>A token is treated as a tag when its text starts with {@code "<"} and ends
 * with {@code ">"}. All such tokens are dropped from the output. When
 * {@code markLineBreaks} is enabled, a tag whose name is in {@link #blockTags}
 * is replaced by a single {@code "\n"} word; consecutive block-level tags
 * produce only one newline word.
 *
 * @param <L> passed through unchanged to {@link AbstractListProcessor}
 * @param <F> passed through unchanged to {@link AbstractListProcessor}
 */
public class StripTagsProcessor<L, F>
extends AbstractListProcessor<Word, Word, L, F> {
    private static final Set<String> BLOCKTAGS = new HashSet<String>(Arrays.asList("blockquote", "br", "div", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "li", "ol", "p", "pre", "table", "tr", "ul"));

    /** Lower-case names of the tags that are replaced by a newline word when line breaks are marked. */
    public static final Set<String> blockTags = BLOCKTAGS;

    /** Whether block-level tags are replaced by a {@code "\n"} word instead of being silently dropped. */
    private boolean markLineBreaks;

    /** Creates a processor that drops all tags without marking line breaks. */
    public StripTagsProcessor() {
        this(false);
    }

    /**
     * Creates a processor.
     *
     * @param markLineBreaks if {@code true}, block-level tags are replaced by a {@code "\n"} word
     */
    public StripTagsProcessor(boolean markLineBreaks) {
        // Assign the field directly: calling the overridable setter from a constructor
        // would run subclass code before the subclass is initialised.
        this.markLineBreaks = markLineBreaks;
    }

    /**
     * @return whether block-level tags are replaced by a {@code "\n"} word
     */
    public boolean getMarkLineBreaks() {
        return this.markLineBreaks;
    }

    /**
     * @param markLineBreaks if {@code true}, block-level tags are replaced by a {@code "\n"} word
     */
    public void setMarkLineBreaks(boolean markLineBreaks) {
        this.markLineBreaks = markLineBreaks;
    }

    /**
     * Returns a new list containing the non-tag words of {@code in}, in order.
     *
     * <p>If line-break marking is enabled, each run of tags that contains at least
     * one block-level tag contributes exactly one {@code "\n"} word at the position
     * of the first block-level tag in that run.
     *
     * @param in the words to filter; not modified
     * @return a newly allocated list with tags removed (and optionally line breaks marked)
     */
    @Override
    public List<Word> process(List<? extends Word> in) {
        List<Word> out = new ArrayList<Word>();
        // Prevents emitting several newline words for adjacent block-level tags.
        boolean justInsertedNewline = false;
        for (Word word : in) {
            String ws = word.word();
            boolean isTag = ws.startsWith("<") && ws.endsWith(">");
            if (!isTag) {
                out.add(word);
                justInsertedNewline = false;
                continue;
            }
            if (!this.markLineBreaks || justInsertedNewline) {
                continue;
            }
            String name = tagName(ws);
            if (name != null && blockTags.contains(name)) {
                out.add(new Word("\n"));
                justInsertedNewline = true;
            }
        }
        return out;
    }

    /**
     * Extracts the lower-cased element name from a tag token such as
     * {@code <p>}, {@code <br/>}, {@code </table>} or {@code <div class="x">}.
     *
     * @param tag a token that starts with {@code "<"} and ends with {@code ">"}
     * @return the tag name, or {@code null} if no letter follows the opening bracket
     */
    private static String tagName(String tag) {
        // Skip the opening bracket and anything (e.g. '/') before the first letter.
        int start = 1;
        while (start < tag.length() && !Character.isLetter(tag.charAt(start))) {
            start++;
        }
        if (start == tag.length()) {
            return null;
        }
        // The name cannot extend past the first whitespace; whatever follows is
        // attribute text and must not become part of the looked-up name.
        int limit = start;
        while (limit < tag.length() && !Character.isWhitespace(tag.charAt(limit))) {
            limit++;
        }
        // Trim trailing punctuation such as '/' and '>' from the name.
        int end = limit - 1;
        while (end > start && !Character.isLetterOrDigit(tag.charAt(end))) {
            end--;
        }
        // Fixed locale: the default locale may map 'I' to a dotless i (Turkish),
        // which would make "<LI>" miss the lower-case entry in blockTags.
        return tag.substring(start, end + 1).toLowerCase(Locale.ENGLISH);
    }

    /**
     * Small demonstration: prints a sample HTML snippet before and after tag
     * stripping, then the result of running {@link WordToSentenceProcessor} on it.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        BasicDocument htmlDoc = BasicDocument.init("top text <h1>HEADING text</h1> this is <p>new paragraph<br>next line<br/>xhtml break etc.");
        System.out.println("Before:");
        System.out.println(htmlDoc);
        Document txtDoc = new StripTagsProcessor(true).processDocument(htmlDoc);
        System.out.println("After:");
        System.out.println(txtDoc);
        Document sentences = new WordToSentenceProcessor().processDocument(txtDoc);
        System.out.println("Sentences:");
        System.out.println(sentences);
    }
}

