Add to Ssurgeon a ReindexGraph operation which recounts the indices on the nodes

AngledLuffa · AngledLuffa · commit 156fad1352d7 · 2025-03-04T23:26:04.000-08:00
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/ReindexGraph.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/ReindexGraph.java
@@ -0,0 +1,65 @@
+package edu.stanford.nlp.semgraph.semgrex.ssurgeon;
+
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import edu.stanford.nlp.ling.IndexedWord;
+import edu.stanford.nlp.semgraph.SemanticGraph;
+import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
+
+/**
+ * Ssurgeon edit which renumbers the nodes of a graph: the vertices are sorted,
+ * the first one gets index 1, and each new original index gets the next number.
+ * Vertices which share an original index are all given the same new index.
+ * Useful in cases such as when a dependency graph is split into two pieces,
+ * perhaps via manual edits.
+ * Takes no arguments; its edit string is just {@code reindexGraph}.
+ *
+ * @author John Bauer
+ */
+public class ReindexGraph extends SsurgeonEdit {
+  public static final String LABEL = "reindexGraph";
+
+  public ReindexGraph() {
+  }
+
+  @Override
+  public String toEditString() {
+    StringWriter buf = new StringWriter();
+    buf.write(LABEL);
+    return buf.toString();
+  }
+
+  @Override
+  public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
+    // maps original index -> new index, so that empty nodes or copy nodes
+    // which share an original index are all moved to the same new index
+    Map<Integer, Integer> knownEdits = new HashMap<>();
+    boolean changed = false;
+
+    List<IndexedWord> vertices = new ArrayList<>(sg.vertexSet());
+    Collections.sort(vertices);  // natural ordering of IndexedWord (presumably by index - confirm)
+
+    int nextIndex = 1;  // next new index to hand out
+    for (IndexedWord vertex : vertices) {
+      final int index;
+      if (knownEdits.containsKey(vertex.index())) {
+        index = knownEdits.get(vertex.index());
+      } else {
+        index = nextIndex;
+        nextIndex++;
+        knownEdits.put(vertex.index(), index);
+      }
+      if (index != vertex.index()) {  // nodes already at the right index are left alone
+        changed = true;
+        SsurgeonUtils.moveNode(sg, sm, vertex, index);
+      }
+    }
+
+    return changed;  // true only if at least one node was moved
+  }
+}
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java
@@ -92,6 +92,7 @@
  * <li> {@code delete -node node}
  * <li> {@code deleteLeaf -node node}
  * <li> {@code killNonRootedNodes}
+ * <li> {@code reindexGraph}
  * </ul>
  *
  *<p>
@@ -177,7 +178,9 @@
  * If the node is not a leaf (no outgoing edges), it will not be deleted.
  *</p><p>
  * {@code killNonRootedNodes} searches the graph and deletes all nodes which have no path to a root.
- *</p>
+ *</p><p>
+ * {@code reindexGraph} reindexes the graph from 1 in case there are gaps or the node indices start later than 1.  (Warning: does not work for first index less than 1)
+ *</p>
  *<p>
  * A practical example comes from the {@code UD_English-Pronouns}
  * dataset, where some words had both {@code nsubj} and {@code csubj}
@@ -627,6 +630,8 @@ public static SsurgeonEdit parseEditLine(String editLine, Map<String, String> at
       } else if (command.equalsIgnoreCase(SplitWord.LABEL)) {
         GrammaticalRelation reln = GrammaticalRelation.valueOf(language, argsBox.reln);
         return new SplitWord(argsBox.nodes.get(0), argsBox.regex, argsBox.headIndex, reln, argsBox.name);
+      } else if (command.equalsIgnoreCase(ReindexGraph.LABEL)) {
+        return new ReindexGraph();
       }
       throw new SsurgeonParseException("Error in SsurgeonEdit.parseEditLine: command '"+command+"' is not supported");
     } catch (SsurgeonParseException e) {
diff --git a/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java b/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java
@@ -2088,6 +2088,48 @@ public void readXMLSplitTwoWordsNamed() {
     assertTrue(found);
   }
 
+  /**
+   * Test reindexGraph, which should renumber the node indices to start at 1 with no gaps
+   */
+  @Test
+  public void readXMLReindexGraph() {
+    String doc = String.join(newline,
+                             "<ssurgeon-pattern-list>",
+                             "  <ssurgeon-pattern>",
+                             "    <uid>38</uid>",
+                             "    <notes>Reindex all nodes to have a base index of 1</notes>",
+                             "    <language>UniversalEnglish</language>",
+                             "    <semgrex>" + XMLUtils.escapeXML("{$}") + "</semgrex>",
+                             "    <edit-list>reindexGraph</edit-list>",
+                             "  </ssurgeon-pattern>",
+                             "</ssurgeon-pattern-list>");
+    Ssurgeon inst = Ssurgeon.inst();
+    List<SsurgeonPattern> patterns = inst.readFromString(doc);
+    assertEquals(patterns.size(), 1);
+    SsurgeonPattern pattern = patterns.get(0);
+
+    SemanticGraph sg = SemanticGraph.valueOf("[example-5 det> the-2 amod> foobar-4]");  // indices 2, 4, 5: starts above 1, has a gap
+    SemanticGraph newSg = pattern.iterate(sg).first;
+    SemanticGraph expected = SemanticGraph.valueOf("[example-3 det> the-1 amod> foobar-2]");
+
+    Map<String, Integer> expectedIndices = new HashMap<String, Integer>() {{
+        put("example", 3);
+        put("the", 1);
+        put("foobar", 2);
+      }};
+    // iterate & assert the indices separately so that if something goes wrong,
+    // it is clear what the error is
+    // the indices are supposed to be remapped to be 1, 2, 3
+    for (IndexedWord vertex : newSg.vertexSet()) {
+      assertTrue(expectedIndices.containsKey(vertex.word()));
+      int index = vertex.index();
+      int expectedIndex = expectedIndices.get(vertex.word());
+      assertEquals(index, expectedIndex);
+    }
+
+    assertEquals(newSg, expected);
+  }
+
   /**
    * Test splitWord, which should split a word into pieces based on regex matches, with three pieces
    */