Skip to content

Commit 98c0b7d

Browse files
committed
Add an option to skip the MWT in a conllu file when training a tagger
1 parent e87f437 commit 98c0b7d

File tree

2 files changed

+20
-6
lines changed

2 files changed

+20
-6
lines changed

src/edu/stanford/nlp/tagger/io/TSVTaggedFileReader.java

+8-3
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ public class TSVTaggedFileReader implements TaggedFileReader {
1616
private final String filename;
1717
private final int wordColumn, tagColumn;
1818
private final boolean usesComments;
19+
// When a UD CoNLL-U file is read directly, multi-word token (MWT) range lines (first column such as "1-2") may need to be skipped
20+
private final boolean skipMWT;
1921
private List<TaggedWord> next; // = null;
2022
private int linesRead; // = 0;
2123

@@ -36,6 +38,7 @@ public TSVTaggedFileReader(TaggedFileRecord record) {
3638
tagColumn = ((record.tagColumn == null) ?
3739
DEFAULT_TAG_COLUMN : record.tagColumn);
3840
usesComments = record.usesComments;
41+
skipMWT = record.skipMWT;
3942
primeNext();
4043
}
4144

@@ -85,9 +88,11 @@ private void primeNext() {
8588
throw new IllegalArgumentException("File " + filename + " line #" +
8689
linesRead + " too short");
8790
}
88-
String word = pieces[wordColumn];
89-
String tag = pieces[tagColumn];
90-
next.add(new TaggedWord(word, tag));
91+
if (!(skipMWT && pieces[0].matches("[0-9]+-[0-9]+"))) {
92+
String word = pieces[wordColumn];
93+
String tag = pieces[tagColumn];
94+
next.add(new TaggedWord(word, tag));
95+
}
9196
}
9297
try {
9398
line = reader.readLine();

src/edu/stanford/nlp/tagger/io/TaggedFileRecord.java

+12-3
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ public enum Format {
3838
final Integer tagColumn;
3939
final TreeReaderFactory trf;
4040
final boolean usesComments;
41+
final boolean skipMWT;
4142

4243
private TaggedFileRecord(String file, Format format,
4344
String encoding, String tagSeparator,
@@ -47,7 +48,7 @@ private TaggedFileRecord(String file, Format format,
4748
NumberRangesFileFilter treeRange,
4849
Predicate<Tree> treeFilter,
4950
Integer wordColumn, Integer tagColumn,
50-
boolean usesComments) {
51+
boolean usesComments, boolean skipMWT) {
5152
this.file = file;
5253
this.format = format;
5354
this.encoding = encoding;
@@ -60,6 +61,7 @@ private TaggedFileRecord(String file, Format format,
6061
this.tagColumn = tagColumn;
6162
this.trf = trf;
6263
this.usesComments = usesComments;
64+
this.skipMWT = skipMWT;
6365
}
6466

6567
public static final String FORMAT = "format";
@@ -73,6 +75,7 @@ private TaggedFileRecord(String file, Format format,
7375
public static final String TAG_COLUMN = "tagColumn";
7476
public static final String TREE_READER = "trf";
7577
public static final String COMMENTS = "comments";
78+
public static final String SKIP_MWT = "skipMWT";
7679

7780
public String toString() {
7881
StringBuilder s = new StringBuilder();
@@ -107,6 +110,9 @@ public String toString() {
107110
if (usesComments) {
108111
s.append("," + COMMENTS + "=true");
109112
}
113+
if (skipMWT) {
114+
s.append("," + SKIP_MWT + "=true");
115+
}
110116
return s.toString();
111117
}
112118

@@ -142,7 +148,7 @@ public static TaggedFileRecord createRecord(Properties config,
142148
return new TaggedFileRecord(description, Format.TEXT,
143149
getEncoding(config),
144150
getTagSeparator(config),
145-
null, null, null, null, null, null, null, false);
151+
null, null, null, null, null, null, null, false, false);
146152
}
147153

148154
String[] args = new String[pieces.length - 1];
@@ -158,6 +164,7 @@ public static TaggedFileRecord createRecord(Properties config,
158164
Predicate<Tree> treeFilter = null;
159165
Integer wordColumn = null, tagColumn = null;
160166
boolean comments = false;
167+
boolean skipMWT = false;
161168

162169
for (String arg : args) {
163170
String[] argPieces = arg.split("=", 2);
@@ -188,14 +195,16 @@ public static TaggedFileRecord createRecord(Properties config,
188195
tagColumn = Integer.valueOf(argPieces[1]);
189196
} else if (argPieces[0].equalsIgnoreCase(COMMENTS)) {
190197
comments = Boolean.valueOf(argPieces[1]);
198+
} else if (argPieces[0].equalsIgnoreCase(SKIP_MWT)) {
199+
skipMWT = Boolean.valueOf(argPieces[1]);
191200
} else {
192201
throw new IllegalArgumentException("TaggedFileRecord argument " +
193202
argPieces[0] + " is unknown");
194203
}
195204
}
196205
return new TaggedFileRecord(file, format, encoding, tagSeparator,
197206
treeTransformer, treeNormalizer, trf, treeRange,
198-
treeFilter, wordColumn, tagColumn, comments);
207+
treeFilter, wordColumn, tagColumn, comments, skipMWT);
199208
}
200209

201210
public static String getEncoding(Properties config) {

0 commit comments

Comments
 (0)