Skip to content

Commit 0e39b37

Browse files
committed
Add the ability to mark newly created nodes with names in the SemgrexMatcher, allowing for a compound operation which then assigns more fields to that node
1 parent 13ede5a commit 0e39b37

File tree

3 files changed

+84
-3
lines changed

3 files changed

+84
-3
lines changed

src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SplitWord.java

+45-2
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,21 @@
1717
* stuck to each of the words. We can separate that out by using two
1818
* regex, one which matches the " in a group, one which matches the
1919
* rest of the word without the "
20+
* <br>
21+
* Aside from the text and the dependency, the new node is rather bare bones.
22+
* Adding the -name argument allows for specifying a comma-separated list
23+
* of names which can be used to insert the new nodes into the SemgrexMatcher
24+
* as named nodes. This will allow for further edits in the same edit step.
25+
* Each entry has the form index=name, where the index is the 0-based position of the split piece, e.g. 0=foo,1=bar.
26+
* <br>
27+
* For example, this will split "foobar" and put the pos ADJ on the first word
28+
* <pre>
29+
* semgrex:
30+
* {word:/foobar/}=split
31+
* ssurgeon:
32+
* splitWord -node split -regex ^(foo)bar$ -regex ^foo(bar)$ -reln dep -headIndex 1 -name 0=asdf
33+
* editNode -node asdf -pos ADJ
34+
* </pre>
2035
*
2136
* @author John Bauer
2237
*/
@@ -27,8 +42,9 @@ public class SplitWord extends SsurgeonEdit {
2742
final List<Pattern> nodeRegex;
2843
final int headIndex;
2944
final GrammaticalRelation relation;
45+
final Map<Integer, String> nodeNames;
3046

31-
public SplitWord(String node, List<String> nodeRegex, Integer headIndex, GrammaticalRelation relation) {
47+
public SplitWord(String node, List<String> nodeRegex, Integer headIndex, GrammaticalRelation relation, String nodeNames) {
3248
if (node == null) {
3349
throw new SsurgeonParseException("SplitWord expected -node with the name of the matched node to split");
3450
}
@@ -54,6 +70,24 @@ public SplitWord(String node, List<String> nodeRegex, Integer headIndex, Grammat
5470
throw new SsurgeonParseException("SplitWord expected a -reln to represent the dependency to use for the new words");
5571
}
5672
this.relation = relation;
73+
74+
if (nodeNames != null) {
75+
String[] namePieces = nodeNames.split(",");
76+
this.nodeNames = new HashMap<>();
77+
for (String namePiece : namePieces) {
78+
String[] pieces = namePiece.split("=", 2);
79+
if (pieces.length < 2) {
80+
throw new SsurgeonParseException("SplitWord got a -name parameter which did not have a number for one of the names. Should look like 0=foo,1=bar");
81+
}
82+
int idx = Integer.valueOf(pieces[0]);
83+
if (idx >= this.nodeRegex.size()) {
84+
throw new SsurgeonParseException("SplitWord got an index in -name which was larger than the largest possible split piece, " + idx + " (this is 0-indexed)");
85+
}
86+
this.nodeNames.put(idx, pieces[1]);
87+
}
88+
} else {
89+
this.nodeNames = Collections.emptyMap();
90+
}
5791
}
5892

5993
@Override
@@ -114,8 +148,12 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
114148
matchedNode.setValue(words.get(headIndex));
115149

116150
for (int i = 0; i < nodeRegex.size(); ++i) {
117-
if (i == headIndex)
151+
if (i == headIndex) {
152+
if (nodeNames.containsKey(i)) {
153+
sm.putNode(nodeNames.get(i), matchedNode);
154+
}
118155
continue;
156+
}
119157

120158
// otherwise, add a word with the appropriate index,
121159
// then connect it to matchedNode
@@ -129,7 +167,12 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
129167

130168
sg.addVertex(newNode);
131169
sg.addEdge(matchedNode, newNode, relation, 0.0, false);
170+
171+
if (nodeNames.containsKey(i)) {
172+
sm.putNode(nodeNames.get(i), newNode);
173+
}
132174
}
175+
133176
return true;
134177
}
135178
}

src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -625,7 +625,7 @@ public static SsurgeonEdit parseEditLine(String editLine, Map<String, String> at
625625
return new CombineMWT(argsBox.nodes, argsBox.annotations.get("word"));
626626
} else if (command.equalsIgnoreCase(SplitWord.LABEL)) {
627627
GrammaticalRelation reln = GrammaticalRelation.valueOf(language, argsBox.reln);
628-
return new SplitWord(argsBox.nodes.get(0), argsBox.regex, argsBox.headIndex, reln);
628+
return new SplitWord(argsBox.nodes.get(0), argsBox.regex, argsBox.headIndex, reln, argsBox.name);
629629
}
630630
throw new SsurgeonParseException("Error in SsurgeonEdit.parseEditLine: command '"+command+"' is not supported");
631631
} catch (SsurgeonParseException e) {

test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java

+38
Original file line numberDiff line numberDiff line change
@@ -2006,6 +2006,44 @@ public void readXMLSplitTwoWordsAfter() {
20062006
assertEquals(newSg, expected);
20072007
}
20082008

2009+
/**
2010+
* Test splitWord with -name: split a word into pieces based on regex matches, with the head at position 1, then use the name given to the new node to set its pos with editNode in the same pattern
2011+
*/
2012+
@Test
2013+
public void readXMLSplitTwoWordsNamed() {
2014+
String doc = String.join(newline,
2015+
"<ssurgeon-pattern-list>",
2016+
" <ssurgeon-pattern>",
2017+
" <uid>38</uid>",
2018+
" <notes>Test splitting a word into two pieces with the head at the start</notes>",
2019+
" <language>UniversalEnglish</language>",
2020+
" <semgrex>" + XMLUtils.escapeXML("{word:/foobar/}=split") + "</semgrex>",
2021+
" <edit-list>splitWord -node split -regex ^(foo)bar$ -regex ^foo(bar)$ -reln dep -headIndex 1 -name 0=asdf</edit-list>",
2022+
" <edit-list>editNode -node asdf -pos ADJ</edit-list>",
2023+
" </ssurgeon-pattern>",
2024+
"</ssurgeon-pattern-list>");
2025+
Ssurgeon inst = Ssurgeon.inst();
2026+
List<SsurgeonPattern> patterns = inst.readFromString(doc);
2027+
assertEquals(patterns.size(), 1);
2028+
SsurgeonPattern pattern = patterns.get(0);
2029+
2030+
SemanticGraph sg = SemanticGraph.valueOf("[example-3 det> the-1 amod> foobar-2]");
2031+
SemanticGraph newSg = pattern.iterate(sg).first;
2032+
SemanticGraph expected = SemanticGraph.valueOf("[example-4 det> the-1 amod> [bar-3 dep> foo-2]]");
2033+
assertEquals(newSg, expected);
2034+
2035+
boolean found = false;
2036+
for (IndexedWord word : newSg.vertexSet()) {
2037+
if (word.index() == 2) {
2038+
assertEquals("ADJ", word.get(CoreAnnotations.PartOfSpeechAnnotation.class));
2039+
found = true;
2040+
} else {
2041+
assertEquals(null, word.get(CoreAnnotations.PartOfSpeechAnnotation.class));
2042+
}
2043+
}
2044+
assertTrue(found);
2045+
}
2046+
20092047
/**
20102048
* Test splitWord, which should split a word into pieces based on regex matches, with three pieces
20112049
*/

0 commit comments

Comments
 (0)