Skip to content

Commit 9ee8b0c

Browse files
committed
Move the corrector earlier in the UDConverter process. The corrected trees are now used for the structure of the UD graphs, not just the tags. This noticeably reduces the number of validator errors.
1 parent 2b91ad0 commit 9ee8b0c

File tree

1 file changed

+31
-14
lines changed

1 file changed

+31
-14
lines changed

src/edu/stanford/nlp/trees/ud/UniversalDependenciesConverter.java

+31-14
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,11 @@ private static class TreeToSemanticGraphIterator implements Iterator<Pair<Semant
9090
private Iterator<Tree> treeIterator;
9191
private Tree currentTree; // = null;
9292

93-
public TreeToSemanticGraphIterator(Iterator<Tree> treeIterator) {
93+
private TreeTransformer corrector; // = null;
94+
95+
public TreeToSemanticGraphIterator(Iterator<Tree> treeIterator, TreeTransformer corrector) {
9496
this.treeIterator = treeIterator;
97+
this.corrector = corrector;
9598
}
9699

97100
@Override
@@ -102,6 +105,25 @@ public boolean hasNext() {
102105
@Override
103106
public Pair<SemanticGraph, SemanticGraph> next() {
104107
Tree t = treeIterator.next();
108+
if (corrector != null) {
109+
t = corrector.transformTree(t);
110+
// The corrector uses tsurgeon, with two limitations:
111+
// - adjoin nodes don't set word(), just set value()
112+
// - rearranging tags doesn't update the tag() of a leaf
113+
List<Tree> preterminals = Trees.preTerminals(t);
114+
for (Tree preterminal : preterminals) {
115+
assert preterminal.children().length == 1;
116+
Tree leaf = preterminal.children()[0];
117+
if (!(leaf.label() instanceof CoreLabel)) {
118+
throw new RuntimeException("These should all be CoreLabels!");
119+
}
120+
CoreLabel leafWord = (CoreLabel) leaf.label();
121+
if (leafWord.word() == null && leafWord.value() != null) {
122+
leafWord.setWord(leafWord.value());
123+
}
124+
leafWord.setTag(preterminal.value());
125+
}
126+
}
105127
currentTree = t;
106128
return new Pair<>(convertTreeToBasic(t), null);
107129
}
@@ -246,10 +268,15 @@ public static void main(String[] args) {
246268
Iterator<Pair<SemanticGraph, SemanticGraph>> sgIterator; // = null;
247269

248270
if (treeFileName != null) {
249-
MemoryTreebank tb = new MemoryTreebank(new NPTmpRetainingTreeNormalizer(0, false, 1, false, true));
271+
NPTmpRetainingTreeNormalizer normalizer = new NPTmpRetainingTreeNormalizer(0, false, 1, false, true);
272+
MemoryTreebank tb = new MemoryTreebank(normalizer);
250273
tb.loadPath(treeFileName);
251274
Iterator<Tree> treeIterator = tb.iterator();
252-
sgIterator = new TreeToSemanticGraphIterator(treeIterator);
275+
TreeTransformer ptbCorrector = null;
276+
if (correctPTB) {
277+
ptbCorrector = new CompositeTreeTransformer(new EnglishPTBTreebankCorrector(), normalizer);
278+
}
279+
sgIterator = new TreeToSemanticGraphIterator(treeIterator, ptbCorrector);
253280
} else if (conlluFileName != null) {
254281
CoNLLUDocumentReader reader = new CoNLLUDocumentReader();
255282
try {
@@ -274,27 +301,17 @@ public static void main(String[] args) {
274301

275302
UniversalDependenciesFeatureAnnotator featureAnnotator = (addFeatures) ? new UniversalDependenciesFeatureAnnotator() : null;
276303
EnglishMWTCombiner mwtCombiner = (combineMWTs) ? new EnglishMWTCombiner() : null;
277-
EnglishPTBTreebankCorrector ptbCorrector = (correctPTB) ? new EnglishPTBTreebankCorrector() : null;
278304

279305
CoNLLUDocumentWriter writer = new CoNLLUDocumentWriter();
280306

281307
int graphIdx = 0;
282308
while (sgIterator.hasNext()) {
283-
Pair<SemanticGraph, SemanticGraph> sgs = sgIterator.next();
309+
final Pair<SemanticGraph, SemanticGraph> sgs = sgIterator.next();
284310
SemanticGraph sg = sgs.first();
285311

286312
if (treeFileName != null) {
287313
//add UPOS tags
288314
Tree tree = ((TreeToSemanticGraphIterator) sgIterator).getCurrentTree();
289-
if (ptbCorrector != null) {
290-
tree = ptbCorrector.transformTree(tree);
291-
List<Label> xposLabels = tree.preTerminalYield();
292-
for (IndexedWord token: sg.vertexListSorted()) {
293-
int idx = token.index() - 1;
294-
String xposTag = xposLabels.get(idx).value();
295-
token.set(CoreAnnotations.PartOfSpeechAnnotation.class, xposTag);
296-
}
297-
}
298315
Tree uposTree = UniversalPOSMapper.mapTree(tree);
299316
List<Label> uposLabels = uposTree.preTerminalYield();
300317
for (IndexedWord token: sg.vertexListSorted()) {

0 commit comments

Comments
 (0)