public final class CONLL12Utils extends java.lang.Object implements CorefConstants
Column Type Description 1 Document ID This is a variation on the document filename 2 Part number Some files are divided into multiple parts numbered as 000, 001, 002, etc. 3 Word number 4 Word itself This is the token as segmented/tokenized in the Treebank. Initially the *_skel files contain the placeholder [WORD] which gets replaced by the actual token from the Treebank which is part of the OntoNotes release. 5 Part-of-Speech 6 Parse bit This is the bracketed structure broken before the first open parenthesis in the parse, and the word/part-of-speech leaf replaced with a *. The full parse can be created by substituting the asterisk with the "([pos] [word])" string (or leaf) and concatenating the items in the rows of that column. 7 Predicate lemma The predicate lemma is mentioned for the rows for which we have semantic role information. All other rows are marked with a "-" 8 Predicate Frameset ID This is the PropBank frameset ID of the predicate in Column 7. 9 Word sense This is the word sense of the word in Column 4. 10 Speaker/Author This is the speaker or author name where available. Mostly in Broadcast Conversation and Web Log data. 11 Named Entities This column identifies the spans representing various named entities. 12:N Predicate Arguments There is one column each of predicate argument structure information for the predicate mentioned in Column 7. N Coreference Coreference chain information encoded in a parenthesis structure.
Modifier and Type | Class and Description |
---|---|
static class |
CONLL12Utils.BlockHandler |
Modifier and Type | Field and Description |
---|---|
static java.util.regex.Pattern |
BAR |
static java.lang.String |
BEGIN_DOCUMENT |
static java.util.regex.Pattern |
BLANK |
static int |
DOC_COL |
static java.lang.String |
END_DOCUMENT |
static int |
ENTITY_COL |
static int |
FORM_COL |
static int |
FRAMESET_COL |
static int |
LEMMA_COL |
static java.util.regex.Pattern |
NUM |
static int |
PARSE_COL |
static int |
PART_COL |
static int |
SENSE_COL |
static int |
SPEAKER_COL |
static int |
TAG_COL |
static int |
WORD_COL |
static java.util.regex.Pattern |
WS |
BEGIN_INDEX_KEY, CLUSTER_ID_KEY, COREFERENCE_DOCUMENT_VIEW_ID, COREFERENCE_EXPLORER_VIEW_ID, COREFERENCE_MANAGER_VIEW_ID, COREFERENCE_PERSPECTIVE_ID, COREFERENCE_PLUGIN_ID, DOCUMENT_EXPLORER_SELECTION_CHANGED, DOCUMENT_INDEX_KEX, EDGE_TYPE, END_INDEX_KEY, ERROR_ANALYSIS_VIEW_ID, MENTION_HEAD_KEY, MENTION_SIZE_KEY, MENTION_TYPE, RANGE_KEY, SENTENCE_INDEX_KEX
DATA_GROUP_LABEL, DATA_GROUP_VALUE, DATA_HEAD_ROOT, DATA_LEFT_LABEL, DATA_LEFT_VALUE, DATA_NO_VALUE, DATA_RIGHT_LABEL, DATA_RIGHT_VALUE, DATA_ROOT_LABEL, DATA_UNDEFINED_DOUBLE_VALUE, DATA_UNDEFINED_FLOAT_VALUE, DATA_UNDEFINED_LABEL, DATA_UNDEFINED_VALUE, DATA_YES_VALUE, DEPREL_KEY, DIRECTION_KEY, DISTANCE_KEY, EDGE_KEY, ENTITY_KEY, FEATURES_KEY, FLAG_PROJECTIVE, FLAGS_KEY, FORM_KEY, FRAMESET_KEY, GENDER_KEY, HEAD_KEY, ID_KEY, INDEX_KEY, LEMMA_KEY, LENGTH_KEY, NUMBER_KEY, PARSE_KEY, POS_KEY, ROOT_KEY, SENSE_KEY, SIZE_KEY, SPEAKER_FEATURES_KEY, SPEAKER_KEY, TAG_KEY, TRANSITIVE_KEY
Modifier and Type | Method and Description |
---|---|
static DefaultCoreferenceData |
readData(DocumentData document,
de.ims.icarus.util.strings.CharTableBuffer buffer) |
static DocumentData |
readDocumentData(DocumentSet documentSet,
de.ims.icarus.util.strings.CharTableBuffer buffer,
CONLL12Utils.BlockHandler blockHandler) |
public static final int DOC_COL
public static final int PART_COL
public static final int WORD_COL
public static final int FORM_COL
public static final int TAG_COL
public static final int PARSE_COL
public static final int LEMMA_COL
public static final int FRAMESET_COL
public static final int SENSE_COL
public static final int SPEAKER_COL
public static final int ENTITY_COL
public static final java.util.regex.Pattern WS
public static final java.util.regex.Pattern BLANK
public static final java.util.regex.Pattern BAR
public static final java.util.regex.Pattern NUM
public static final java.lang.String BEGIN_DOCUMENT
public static final java.lang.String END_DOCUMENT
public static DefaultCoreferenceData readData(DocumentData document, de.ims.icarus.util.strings.CharTableBuffer buffer)
public static DocumentData readDocumentData(DocumentSet documentSet, de.ims.icarus.util.strings.CharTableBuffer buffer, CONLL12Utils.BlockHandler blockHandler) throws java.io.IOException
java.io.IOException