Add the ability to mark newly created nodes with names in the SemgrexMatcher, allowing for a compound operation which then assigns more fields to that node

AngledLuffa · AngledLuffa · commit 0e39b3731d12 · 2024-07-02T13:46:22.000-07:00
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SplitWord.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SplitWord.java
@@ -17,6 +17,21 @@
  * stuck to each of the words.  We can separate that out by using two
  * regex, one which matches the " in a group, one which matches the
  * rest of the word without the "
+ * <br>
+ * Aside from the text and the dependency, the new node is rather bare bones.
+ * Adding the -name argument allows for specifying a comma-separated list
+ * of index=name pairs, such as 0=foo,1=bar, which are used to insert the
+ * corresponding pieces into the SemgrexMatcher as named nodes.  This allows
+ * for further edits in the same edit step.  The indices are 0-indexed.
+ * <br>
+ * For example, this will split "foobar" and put the pos ADJ on the first word
+ * <pre>
+ * semgrex:
+ *   {word:/foobar/}=split
+ * ssurgeon:
+ *   splitWord -node split -regex ^(foo)bar$ -regex ^foo(bar)$ -reln dep -headIndex 1 -name 0=asdf
+ *   editNode -node asdf -pos ADJ
+ * </pre>
  *
  * @author John Bauer
  */
@@ -27,8 +42,9 @@ public class SplitWord extends SsurgeonEdit {
   final List<Pattern> nodeRegex;
   final int headIndex;
   final GrammaticalRelation relation;
+  final Map<Integer, String> nodeNames;
 
-  public SplitWord(String node, List<String> nodeRegex, Integer headIndex, GrammaticalRelation relation) {
+  public SplitWord(String node, List<String> nodeRegex, Integer headIndex, GrammaticalRelation relation, String nodeNames) {
     if (node == null) {
       throw new SsurgeonParseException("SplitWord expected -node with the name of the matched node to split");
     }
@@ -54,6 +70,24 @@ public SplitWord(String node, List<String> nodeRegex, Integer headIndex, Grammat
       throw new SsurgeonParseException("SplitWord expected a -reln to represent the dependency to use for the new words");
     }
     this.relation = relation;
+
+    if (nodeNames != null) {
+      String[] namePieces = nodeNames.split(",");
+      this.nodeNames = new HashMap<>();
+      for (String namePiece : namePieces) {
+        String[] pieces = namePiece.split("=", 2);
+        if (pieces.length < 2) {
+          throw new SsurgeonParseException("SplitWord got a -name parameter which did not have a number for one of the names.  Should look like 0=foo,1=bar");
+        }
+        int idx = Integer.valueOf(pieces[0]);
+        if (idx >= this.nodeRegex.size()) {
+          throw new SsurgeonParseException("SplitWord got an index in -name which was larger than the largest possible split piece, " + idx + " (this is 0-indexed)");
+        }
+        this.nodeNames.put(idx, pieces[1]);
+      }
+    } else {
+      this.nodeNames = Collections.emptyMap();
+    }
   }
 
   @Override
@@ -114,8 +148,12 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
     matchedNode.setValue(words.get(headIndex));
 
     for (int i = 0; i < nodeRegex.size(); ++i) {
-      if (i == headIndex)
+      if (i == headIndex) {
+        if (nodeNames.containsKey(i)) {
+          sm.putNode(nodeNames.get(i), matchedNode);
+        }
         continue;
+      }
 
       // otherwise, add a word with the appropriate index,
       // then connect it to matchedNode
@@ -129,7 +167,12 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
 
       sg.addVertex(newNode);
       sg.addEdge(matchedNode, newNode, relation, 0.0, false);
+
+      if (nodeNames.containsKey(i)) {
+        sm.putNode(nodeNames.get(i), newNode);
+      }
     }
+
     return true;
   }
 }
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java
@@ -625,7 +625,7 @@ public static SsurgeonEdit parseEditLine(String editLine, Map<String, String> at
         return new CombineMWT(argsBox.nodes, argsBox.annotations.get("word"));
       } else if (command.equalsIgnoreCase(SplitWord.LABEL)) {
         GrammaticalRelation reln = GrammaticalRelation.valueOf(language, argsBox.reln);
-        return new SplitWord(argsBox.nodes.get(0), argsBox.regex, argsBox.headIndex, reln);
+        return new SplitWord(argsBox.nodes.get(0), argsBox.regex, argsBox.headIndex, reln, argsBox.name);
       }
       throw new SsurgeonParseException("Error in SsurgeonEdit.parseEditLine: command '"+command+"' is not supported");
     } catch (SsurgeonParseException e) {
diff --git a/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java b/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java
@@ -2006,6 +2006,44 @@ public void readXMLSplitTwoWordsAfter() {
     assertEquals(newSg, expected);
   }
 
+  /**
+   * Test splitWord with -name: split a word in two with the head at position 1, name the new non-head piece, then set its pos with editNode
+   */
+  @Test
+  public void readXMLSplitTwoWordsNamed() {
+    String doc = String.join(newline,
+                             "<ssurgeon-pattern-list>",
+                             "  <ssurgeon-pattern>",
+                             "    <uid>38</uid>",
+                             "    <notes>Test splitting a word into two pieces with the head at the start</notes>",
+                             "    <language>UniversalEnglish</language>",
+                             "    <semgrex>" + XMLUtils.escapeXML("{word:/foobar/}=split") + "</semgrex>",
+                             "    <edit-list>splitWord -node split -regex ^(foo)bar$ -regex ^foo(bar)$ -reln dep -headIndex 1 -name 0=asdf</edit-list>",
+                             "    <edit-list>editNode -node asdf -pos ADJ</edit-list>",
+                             "  </ssurgeon-pattern>",
+                             "</ssurgeon-pattern-list>");
+    Ssurgeon inst = Ssurgeon.inst();
+    List<SsurgeonPattern> patterns = inst.readFromString(doc);
+    assertEquals(patterns.size(), 1);
+    SsurgeonPattern pattern = patterns.get(0);
+
+    SemanticGraph sg = SemanticGraph.valueOf("[example-3 det> the-1 amod> foobar-2]");
+    SemanticGraph newSg = pattern.iterate(sg).first;
+    SemanticGraph expected = SemanticGraph.valueOf("[example-4 det> the-1 amod> [bar-3 dep> foo-2]]");
+    assertEquals(newSg, expected);
+
+    boolean found = false;
+    for (IndexedWord word : newSg.vertexSet()) {
+      if (word.index() == 2) {
+        assertEquals("ADJ", word.get(CoreAnnotations.PartOfSpeechAnnotation.class));
+        found = true;
+      } else {
+        assertEquals(null, word.get(CoreAnnotations.PartOfSpeechAnnotation.class));
+      }
+    }
+    assertTrue(found);
+  }
+
   /**
    * Test splitWord, which should split a word into pieces based on regex matches, with three pieces
    */

Original file line number	Diff line number	Diff line change
`@@ -625,7 +625,7 @@ public static SsurgeonEdit parseEditLine(String editLine, Map<String, String> at`
`625`	`625`	`return new CombineMWT(argsBox.nodes, argsBox.annotations.get("word"));`
`626`	`626`	`} else if (command.equalsIgnoreCase(SplitWord.LABEL)) {`
`627`	`627`	`GrammaticalRelation reln = GrammaticalRelation.valueOf(language, argsBox.reln);`
`628`		`- return new SplitWord(argsBox.nodes.get(0), argsBox.regex, argsBox.headIndex, reln);`
	`628`	`+ return new SplitWord(argsBox.nodes.get(0), argsBox.regex, argsBox.headIndex, reln, argsBox.name);`
`629`	`629`	`}`
`630`	`630`	`throw new SsurgeonParseException("Error in SsurgeonEdit.parseEditLine: command '"+command+"' is not supported");`
`631`	`631`	`} catch (SsurgeonParseException e) {`