Skip to content

Commit 2fce986

Browse files
committed
Add a node containment option to semgrex that works on CoreAnnotations which are Maps.
Currently the syntax is @ (subject to change), e.g., morphofeatures@foo=bar. In this expression, bar can be a regex, but foo and morphofeatures cannot. It might be worth adding regex capabilities for both of those. Also, !@ would be a useful addition. This checks at Semgrex compile time (not Java compile time) that the annotation used for key/value is actually a Map. Has a test that the error checking and a simple search both work.
1 parent dcee001 commit 2fce986

File tree

5 files changed

+219
-58
lines changed

5 files changed

+219
-58
lines changed

src/edu/stanford/nlp/semgraph/semgrex/NodeAttributes.java

+16
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,20 @@ public class NodeAttributes {
2626
// String, String, Boolean: key, value, negated
2727
private List<Triple<String, String, Boolean>> attributes;
2828
private Set<String> positiveAttributes;
29+
// Some annotations, especially morpho features (CoreAnnotations.CoNLLUFeats)
30+
// are represented by Maps. In some cases it will be easier to search
31+
// for individual elements of that map rather than turn the map into a string
32+
// and search on its contents that way. This is especially true since there
33+
// is no guarantee the map will be in a consistent order.
34+
// String, String, String: node attribute for a map (such as CoNLLUFeats), key in that map, value to match
35+
private List<Triple<String, String, String>> contains;
2936

3037
public NodeAttributes() {
3138
root = false;
3239
empty = false;
3340
attributes = new ArrayList<>();
3441
positiveAttributes = new HashSet<>();
42+
contains = new ArrayList<>();
3543
}
3644

3745
public void setRoot(boolean root) {
@@ -60,7 +68,15 @@ public void setAttribute(String key, String value, boolean negated) {
6068
attributes.add(new Triple(key, value, negated));
6169
}
6270

71+
public void addContains(String annotation, String key, String value) {
72+
contains.add(new Triple(annotation, key, value));
73+
}
74+
6375
public List<Triple<String, String, Boolean>> attributes() {
6476
return Collections.unmodifiableList(attributes);
6577
}
78+
79+
public List<Triple<String, String, String>> contains() {
80+
return Collections.unmodifiableList(contains);
81+
}
6682
}

src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java

+84-17
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import java.util.regex.Matcher;
99
import java.util.regex.Pattern;
1010

11+
import edu.stanford.nlp.ling.AnnotationLookup;
1112
import edu.stanford.nlp.ling.IndexedWord;
1213
import edu.stanford.nlp.semgraph.SemanticGraph;
1314
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
@@ -32,6 +33,11 @@ public class NodePattern extends SemgrexPattern {
3233
* Otherwise, the type will be a Pattern, and you must use Pattern.matches().
3334
*/
3435
private final List<Attribute> attributes;
36+
/**
37+
* Attributes whose values are Maps (e.g., CoNLLUFeats)
38+
* and for which only a partial match on the map's contents is necessary
39+
*/
40+
private final List<Pair<String, Attribute>> partialAttributes;
3541
private final boolean isRoot;
3642
private final boolean isLink;
3743
private final boolean isEmpty;
@@ -58,6 +64,9 @@ public NodePattern(GraphRelation r, boolean negDesc,
5864
// order the attributes so that the pattern stays the same when
5965
// printing a compiled pattern
6066
this.attributes = new ArrayList<>();
67+
// same with partial attributes
68+
this.partialAttributes = new ArrayList<>();
69+
6170
descString = "{";
6271
for (Triple<String, String, Boolean> entry : attrs.attributes()) {
6372
if (!descString.equals("{"))
@@ -70,23 +79,7 @@ public NodePattern(GraphRelation r, boolean negDesc,
7079
if (value.equals("__")) {
7180
attributes.add(new Attribute(key, true, true, negated));
7281
} else if (value.matches("/.*/")) {
73-
boolean isRegexp = false;
74-
for (int i = 1; i < value.length() - 1; ++i) {
75-
char chr = value.charAt(i);
76-
if ( !( (chr >= 'A' && chr <= 'Z') || (chr >= 'a' && chr <= 'z') || (chr >= '0' && chr <= '9') ) ) {
77-
isRegexp = true;
78-
break;
79-
}
80-
}
81-
String patternContent = value.substring(1, value.length() - 1);
82-
if (isRegexp) {
83-
attributes.add(new Attribute(key,
84-
Pattern.compile(patternContent),
85-
Pattern.compile(patternContent, Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE),
86-
negated));
87-
} else {
88-
attributes.add(new Attribute(key, patternContent, patternContent, negated));
89-
}
82+
attributes.add(buildRegexAttribute(key, value, negated));
9083
} else { // raw description
9184
attributes.add(new Attribute(key, value, value, negated));
9285
}
@@ -98,6 +91,33 @@ public NodePattern(GraphRelation r, boolean negDesc,
9891
}
9992
}
10093

94+
for (Triple<String, String, String> entry : attrs.contains()) {
95+
String annotation = entry.first();
96+
String key = entry.second();
97+
String value = entry.third();
98+
99+
Class<?> clazz = AnnotationLookup.getValueType(AnnotationLookup.toCoreKey(annotation));
100+
boolean isMap = clazz != null && Map.class.isAssignableFrom(clazz);
101+
if (!isMap) {
102+
throw new SemgrexParseException("Cannot process a single key/value from annotation " + annotation + " as it is not a Map");
103+
}
104+
105+
final Attribute attr;
106+
// Add the attributes for this key
107+
if (value.equals("__")) {
108+
attr = new Attribute(key, true, true, false);
109+
} else if (value.matches("/.*/")) {
110+
attr = buildRegexAttribute(key, value, false);
111+
} else { // raw description
112+
attr = new Attribute(key, value, value, false);
113+
}
114+
partialAttributes.add(new Pair<>(annotation, attr));
115+
116+
if (!descString.equals("{"))
117+
descString += ";";
118+
descString += (annotation + "@" + key + "=" + value);
119+
}
120+
101121
if (attrs.root()) {
102122
if (!descString.equals("{"))
103123
descString += ";";
@@ -118,6 +138,30 @@ public NodePattern(GraphRelation r, boolean negDesc,
118138
this.variableGroups = Collections.unmodifiableList(variableGroups);
119139
}
120140

141+
/**
142+
* Tests the value to see if it's really a regex, or just a plain ASCII alphanumeric string wrapped in /slashes/.
143+
* Returns an Attribute which matches this expression.
144+
*/
145+
private Attribute buildRegexAttribute(String key, String value, boolean negated) {
146+
boolean isRegexp = false;
147+
for (int i = 1; i < value.length() - 1; ++i) {
148+
char chr = value.charAt(i);
149+
if ( !( (chr >= 'A' && chr <= 'Z') || (chr >= 'a' && chr <= 'z') || (chr >= '0' && chr <= '9') ) ) {
150+
isRegexp = true;
151+
break;
152+
}
153+
}
154+
String patternContent = value.substring(1, value.length() - 1);
155+
if (isRegexp) {
156+
return new Attribute(key,
157+
Pattern.compile(patternContent),
158+
Pattern.compile(patternContent, Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE),
159+
negated);
160+
} else {
161+
return new Attribute(key, patternContent, patternContent, negated);
162+
}
163+
}
164+
121165
private boolean checkMatch(Attribute attr, boolean ignoreCase, String nodeValue) {
122166
if (nodeValue == null) {
123167
// treat non-existent attributes as having matched a negated expression
@@ -189,6 +233,29 @@ public boolean nodeAttrMatch(IndexedWord node, final SemanticGraph sg, boolean i
189233
return negDesc;
190234
}
191235
}
236+
for (Pair<String, Attribute> partialAttribute : partialAttributes) {
237+
String annotation = partialAttribute.first();
238+
Attribute attr = partialAttribute.second();
239+
240+
Class clazz = Env.lookupAnnotationKey(env, annotation);
241+
Object rawmap = node.get(clazz);
242+
// if the map is null, it can't possibly match...
243+
if (rawmap == null) {
244+
return negDesc;
245+
}
246+
if (!(rawmap instanceof Map))
247+
throw new RuntimeException("Can only use partial attributes with Maps... this should have been checked at creation time!");
248+
Map<String, ?> map = (Map) rawmap;
249+
250+
// TODO: allow for regex match on the keys?
251+
Object value = map.get(attr.key);
252+
final String nodeValue = (value == null) ? null : value.toString();
253+
boolean matches = checkMatch(attr, ignoreCase, nodeValue);
254+
if (!matches) {
255+
return negDesc;
256+
}
257+
}
258+
192259
// System.out.println("matches");
193260
// System.out.println("");
194261
return !negDesc;

0 commit comments

Comments
 (0)