Skip to content

Commit 2341d33

Browse files
committed Nov 28, 2024
Don't put SpaceAfter=No annotations on words which are at the start or middle of an MWT
1 parent 6f6eb93 commit 2341d33

File tree

1 file changed

+30
-10
lines changed

1 file changed

+30
-10
lines changed
 

‎src/edu/stanford/nlp/trees/ud/CoNLLUDocumentWriter.java

+30-10
Original file line numberDiff line numberDiff line change
@@ -101,10 +101,20 @@ public String printSemanticGraph(SemanticGraph basicSg, SemanticGraph enhancedSg
101101

102102
// don't use after() directly; it returns a default of ""
103103
if (token.get(CoreAnnotations.AfterAnnotation.class) != null && token.after().equals("")) {
104-
if (misc.equals("_")) {
105-
misc = "SpaceAfter=No";
106-
} else {
107-
misc = misc + "|SpaceAfter=No";
104+
IndexedWord nextVertex = tokenSg.getNodeByIndex(token.index() + 1);
105+
// the next word needs to exist and be part of the same MWT
106+
// and either this word is the start of the MWT
107+
// or this word is the middle of the same MWT as the next word
108+
// if that is true, we will skip the SpaceAfter annotation
109+
boolean inMWT = ((nextVertex != null && isMWTbutNotStart(nextVertex)) &&
110+
((token.containsKey(CoreAnnotations.IsFirstWordOfMWTAnnotation.class) && token.get(CoreAnnotations.IsFirstWordOfMWTAnnotation.class)) ||
111+
(isMWTbutNotStart(token))));
112+
if (!inMWT) {
113+
if (misc.equals("_")) {
114+
misc = "SpaceAfter=No";
115+
} else {
116+
misc = misc + "|SpaceAfter=No";
117+
}
108118
}
109119
}
110120

@@ -151,19 +161,29 @@ public static void printSpan(StringBuilder sb, AbstractCoreLabel token) {
151161
}
152162
}
153163

164+
/**
165+
* Is the word part of an MWT, but not the start?
166+
*/
167+
public static boolean isMWTbutNotStart(IndexedWord nextVertex) {
168+
if (nextVertex.containsKey(CoreAnnotations.IsFirstWordOfMWTAnnotation.class) &&
169+
nextVertex.get(CoreAnnotations.IsFirstWordOfMWTAnnotation.class)) {
170+
return false;
171+
}
172+
if (!nextVertex.containsKey(CoreAnnotations.IsMultiWordTokenAnnotation.class) ||
173+
!nextVertex.get(CoreAnnotations.IsMultiWordTokenAnnotation.class)) {
174+
return false;
175+
}
176+
return true;
177+
}
178+
154179
public static void printMWT(StringBuilder sb, SemanticGraph graph, IndexedWord token) {
155180
int startIndex = token.index();
156181
int endIndex = startIndex;
157182
// advance endIndex until we reach the end of the sentence, the start of the next MWT,
158183
// or a word which isn't part of any MWT
159184
IndexedWord nextVertex;
160185
while ((nextVertex = graph.getNodeByIndex(endIndex+1)) != null) {
161-
if (nextVertex.containsKey(CoreAnnotations.IsFirstWordOfMWTAnnotation.class) &&
162-
nextVertex.get(CoreAnnotations.IsFirstWordOfMWTAnnotation.class)) {
163-
break;
164-
}
165-
if (!nextVertex.containsKey(CoreAnnotations.IsMultiWordTokenAnnotation.class) ||
166-
!nextVertex.get(CoreAnnotations.IsMultiWordTokenAnnotation.class)) {
186+
if (!isMWTbutNotStart(nextVertex)) {
167187
break;
168188
}
169189
++endIndex;

0 commit comments

Comments
 (0)
Please sign in to comment.