Skip to content

Commit 156fad1

Browse files
committed
Add to Ssurgeon a ReindexGraph operation which recounts the indices on the nodes
1 parent 499eb5b commit 156fad1

File tree

3 files changed

+113
-1
lines changed

3 files changed

+113
-1
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
package edu.stanford.nlp.semgraph.semgrex.ssurgeon;
2+
3+
import java.io.StringWriter;
4+
import java.util.ArrayList;
5+
import java.util.Collections;
6+
import java.util.HashMap;
7+
import java.util.List;
8+
import java.util.Map;
9+
10+
import edu.stanford.nlp.ling.IndexedWord;
11+
import edu.stanford.nlp.semgraph.SemanticGraph;
12+
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
13+
14+
/**
15+
* Go through the nodes. Reindex them all so that the starting index is 1,
16+
* then all the other indices follow from there.
17+
*
18+
* Useful in cases such as when a dependency graph is split into two pieces,
19+
* perhaps via manual edits
20+
*
21+
* @author John Bauer
22+
*
23+
*/
24+
public class ReindexGraph extends SsurgeonEdit {
25+
public static final String LABEL = "reindexGraph";
26+
27+
public ReindexGraph() {
28+
}
29+
30+
@Override
31+
public String toEditString() {
32+
StringWriter buf = new StringWriter();
33+
buf.write(LABEL);
34+
return buf.toString();
35+
}
36+
37+
@Override
38+
public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
39+
// we keep indices in case there are empty nodes or copy nodes
40+
// for which we want to use the same index twice
41+
Map<Integer, Integer> knownEdits = new HashMap<>();
42+
boolean changed = false;
43+
44+
List<IndexedWord> vertices = new ArrayList<>(sg.vertexSet());
45+
Collections.sort(vertices);
46+
47+
int nextIndex = 1;
48+
for (IndexedWord vertex : vertices) {
49+
final int index;
50+
if (knownEdits.containsKey(vertex.index())) {
51+
index = knownEdits.get(vertex.index());
52+
} else {
53+
index = nextIndex;
54+
nextIndex++;
55+
knownEdits.put(vertex.index(), index);
56+
}
57+
if (index != vertex.index()) {
58+
changed = true;
59+
SsurgeonUtils.moveNode(sg, sm, vertex, index);
60+
}
61+
}
62+
63+
return changed;
64+
}
65+
}

src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java

+6-1
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@
9292
* <li> {@code delete -node node}
9393
* <li> {@code deleteLeaf -node node}
9494
* <li> {@code killNonRootedNodes}
95+
* <li> {@code reindexGraph}
9596
* </ul>
9697
*
9798
*<p>
@@ -177,7 +178,9 @@
177178
* If the node is not a leaf (no outgoing edges), it will not be deleted.
178179
*</p><p>
179180
* {@code killNonRootedNodes} searches the graph and deletes all nodes which have no path to a root.
180-
*</p>
181+
*</p><p>
182+
* {@code reindexGraph} renumbers the node indices consecutively starting from 1, for cases where there are gaps or the node indices start later than 1. (Warning: does not work if the first index is less than 1.)
183+
*</p>
181184
*<p>
182185
* A practical example comes from the {@code UD_English-Pronouns}
183186
* dataset, where some words had both {@code nsubj} and {@code csubj}
@@ -627,6 +630,8 @@ public static SsurgeonEdit parseEditLine(String editLine, Map<String, String> at
627630
} else if (command.equalsIgnoreCase(SplitWord.LABEL)) {
628631
GrammaticalRelation reln = GrammaticalRelation.valueOf(language, argsBox.reln);
629632
return new SplitWord(argsBox.nodes.get(0), argsBox.regex, argsBox.headIndex, reln, argsBox.name);
633+
} else if (command.equalsIgnoreCase(ReindexGraph.LABEL)) {
634+
return new ReindexGraph();
630635
}
631636
throw new SsurgeonParseException("Error in SsurgeonEdit.parseEditLine: command '"+command+"' is not supported");
632637
} catch (SsurgeonParseException e) {

test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java

+42
Original file line numberDiff line numberDiff line change
@@ -2088,6 +2088,48 @@ public void readXMLSplitTwoWordsNamed() {
20882088
assertTrue(found);
20892089
}
20902090

2091+
/**
2092+
* Test reindexGraph, which should renumber the node indices so that they start at 1 and have no gaps
2093+
*/
2094+
@Test
2095+
public void readXMLReindexGraph() {
2096+
String doc = String.join(newline,
2097+
"<ssurgeon-pattern-list>",
2098+
" <ssurgeon-pattern>",
2099+
" <uid>38</uid>",
2100+
" <notes>Reindex all nodes to have a base index of 1</notes>",
2101+
" <language>UniversalEnglish</language>",
2102+
" <semgrex>" + XMLUtils.escapeXML("{$}") + "</semgrex>",
2103+
" <edit-list>reindexGraph</edit-list>",
2104+
" </ssurgeon-pattern>",
2105+
"</ssurgeon-pattern-list>");
2106+
Ssurgeon inst = Ssurgeon.inst();
2107+
List<SsurgeonPattern> patterns = inst.readFromString(doc);
2108+
assertEquals(patterns.size(), 1);
2109+
SsurgeonPattern pattern = patterns.get(0);
2110+
2111+
SemanticGraph sg = SemanticGraph.valueOf("[example-5 det> the-2 amod> foobar-4]");
2112+
SemanticGraph newSg = pattern.iterate(sg).first;
2113+
SemanticGraph expected = SemanticGraph.valueOf("[example-3 det> the-1 amod> foobar-2]");
2114+
2115+
Map<String, Integer> expectedIndices = new HashMap<String, Integer>() {{
2116+
put("example", 3);
2117+
put("the", 1);
2118+
put("foobar", 2);
2119+
}};
2120+
// iterate & assert the indices separately so that if something goes wrong,
2121+
// it is clear what the error is
2122+
// the indices are supposed to be remapped to be 1, 2, 3
2123+
for (IndexedWord vertex : newSg.vertexSet()) {
2124+
assertTrue(expectedIndices.containsKey(vertex.word()));
2125+
int index = vertex.index();
2126+
int expectedIndex = expectedIndices.get(vertex.word());
2127+
assertEquals(index, expectedIndex);
2128+
}
2129+
2130+
assertEquals(newSg, expected);
2131+
}
2132+
20912133
/**
20922134
* Test splitWord, which should split a word into pieces based on regex matches, with three pieces
20932135
*/

0 commit comments

Comments
 (0)