例如,来自斯坦福情感树库(Stanford Sentiment Treebank)的解析树:
"(2 (2 (2 near) (2 (2 the) (2 end))) (3 (3 (2 takes) (2 (2 on) (2 (2 a) (2 (2 whole) (2 (2 other) (2 meaning)))))) (2 .)))",
其中,数字是每个节点的情感标签。
我希望向每个节点添加POS标记信息。例如:
"(NP (ADJP (IN near)) (DT the) (NN end)) "
我曾尝试直接对句子进行句法解析,但得到的树与情感树库中的树不同(可能是解析器版本或参数不同所致;我曾尝试联系作者,但没有得到回复)。
如何获取标记信息?
发布于 2016-05-11 07:42:55
我认为edu.stanford.nlp.sentiment.BuildBinarizedDataset中的代码应该是有用的。main()方法逐步介绍如何在Java代码中创建这些二叉树。
代码中需要注意的一些关键行:
LexicalizedParser parser = LexicalizedParser.loadModel(parserModel);
TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
...
Tree tree = parser.apply(tokens);
Tree binarized = binarizer.transformTree(tree);
您可以从 Tree 对象访问节点的标记信息。建议查看 edu.stanford.nlp.trees.Tree 的 javadoc,了解如何访问这些信息。
同样在这个答案中,我有一些代码显示了访问树的情况:
您需要调用每棵树及其子树的 label() 方法,以获取节点的标记。
以下是GitHub上对BuildBinarizedDataset.java的引用:
如果有什么不清楚的地方,请告诉我,我可以提供进一步的帮助!
发布于 2017-05-12 19:16:19
首先,您需要下载 Stanford Parser(斯坦福句法分析器)。
设置
private LexicalizedParser parser;
private TreeBinarizer binarizer;
private CollapseUnaryTransformer transformer;
parser = LexicalizedParser.loadModel(PCFG_PATH);
binarizer = TreeBinarizer.simpleTreeBinarizer(
parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
transformer = new CollapseUnaryTransformer();
解析:
Tree tree = parser.apply(tokens);
访问 POS 标记:
/**
 * Collects one label string per non-leaf node of the transformed parse tree.
 *
 * <p>The input tree is first passed through {@code binarizer} and then through
 * {@code transformer} (a {@code CollapseUnaryTransformer}); all numbering below
 * refers to that transformed tree, not to {@code tree} itself.
 *
 * <p>Slot numbering: the node directly above the i-th leaf uses slot {@code i};
 * every other node receives the next free slot (starting at the number of
 * leaves) the first time it is reached while walking upwards from a leaf.
 *
 * @param tree the parse tree to transform and label
 * @return an array with {@code collapsedUnary.size() - leaves.size()} entries
 *         (presumably one per non-leaf node - confirm {@code Tree.size()} counts all nodes)
 */
public String[] constTreePOSTAG(Tree tree) {
Tree binarized = binarizer.transformTree(tree);
Tree collapsedUnary = transformer.transformTree(binarized);
Trees.convertToCoreLabels(collapsedUnary);
collapsedUnary.indexSpans();
List<Tree> leaves = collapsedUnary.getLeaves();
int size = collapsedUnary.size() - leaves.size();
String[] tags = new String[size];
// Maps a node's nodeNumber (relative to the transformed root) to its slot in tags.
HashMap<Integer, Integer> index = new HashMap<Integer, Integer>();
// Slots 0..leaves.size()-1 are reserved for the nodes directly above the leaves.
int idx = leaves.size();
int leafIdx = 0;
for (Tree leaf : leaves) {
Tree cur = leaf.parent(collapsedUnary); // go to preterminal
int curIdx = leafIdx++;
boolean done = false;
// Walk upwards until the root or an already-numbered ancestor is reached.
while (!done) {
Tree parent = cur.parent(collapsedUnary);
if (parent == null) {
// cur is the root: it stores its own label.
tags[curIdx] = cur.label().toString();
break;
}
int parentIdx;
int parentNumber = parent.nodeNumber(collapsedUnary);
if (!index.containsKey(parentNumber)) {
// First visit of this ancestor: give it the next free slot.
parentIdx = idx++;
index.put(parentNumber, parentIdx);
} else {
// Ancestor already numbered on an earlier walk: stop after this step.
parentIdx = index.get(parentNumber);
done = true;
}
// NOTE(review): the current node's slot receives its PARENT's label here, whereas
// the root branch above stores the node's own label - confirm whether
// cur.label() was intended.
tags[curIdx] = parent.label().toString();
cur = parent;
curIdx = parentIdx;
}
}
return tags;
}下面是运行的完整源代码ConstituencyParse.java : Use : java ConstituencyParse -tokpath outputtoken.toks -parentpath outputparent.txt -tagpath outputag.txt <
(注意:源代码来自 treelstm 代码仓库,您还需要修改 preprocess-sst.py,使其调用下面的 ConstituencyParse.java 文件)
import edu.stanford.nlp.process.WordTokenFactory;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.parser.lexparser.TreeBinarizer;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Trees;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.TypedDependency;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.StringReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.HashMap;
import java.util.Properties;
import java.util.Scanner;
public class ConstituencyParse {
// When true, input lines are tokenized with PTBTokenizer; otherwise they are split on single spaces.
private boolean tokenize;
// Output writers for tokens, parent pointers and tags; tokWriter stays null when no token path is given.
private BufferedWriter tokWriter, parentWriter, tagWriter;
// Parser loaded from PCFG_PATH and used to parse every input sentence.
private LexicalizedParser parser;
// Applied to each parse tree before unary chains are collapsed.
private TreeBinarizer binarizer;
// Applied to the binarized tree before node slots are assigned.
private CollapseUnaryTransformer transformer;
// Factory used to derive a dependency structure from a constituency tree.
private GrammaticalStructureFactory gsf;
// Resource path of the serialized English PCFG parser model.
private static final String PCFG_PATH = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
/**
 * Opens the output files and loads the parsing machinery.
 *
 * @param tokPath    path of the token output file, or {@code null} to skip token output
 * @param parentPath path of the parent-pointer output file
 * @param tagPath    path of the tag output file
 * @param tokenize   whether input lines should be tokenized with {@code PTBTokenizer}
 * @throws IOException if one of the output files cannot be opened
 */
public ConstituencyParse(String tokPath, String parentPath, String tagPath, boolean tokenize) throws IOException {
    this.tokenize = tokenize;

    // The token file is optional; the other two outputs are always written.
    this.tokWriter = (tokPath == null) ? null : new BufferedWriter(new FileWriter(tokPath));
    this.parentWriter = new BufferedWriter(new FileWriter(parentPath));
    this.tagWriter = new BufferedWriter(new FileWriter(tagPath));

    this.parser = LexicalizedParser.loadModel(PCFG_PATH);
    this.binarizer = TreeBinarizer.simpleTreeBinarizer(
        this.parser.getTLPParams().headFinder(),
        this.parser.treebankLanguagePack());
    this.transformer = new CollapseUnaryTransformer();

    // Needed to turn constituency trees into dependency representations.
    TreebankLanguagePack languagePack = new PennTreebankLanguagePack();
    this.gsf = languagePack.grammaticalStructureFactory();
}
/**
 * Converts one input line into the token list handed to the parser.
 *
 * <p>With tokenization enabled the line is run through {@code PTBTokenizer};
 * otherwise it is split on single space characters, so the caller is expected
 * to supply pre-tokenized text.
 *
 * @param line one sentence of input
 * @return the tokens of {@code line}, in order
 */
public List<HasWord> sentenceToTokens(String line) {
    List<HasWord> tokens = new ArrayList<>();
    if (tokenize) {
        // Parameterized type instead of the raw PTBTokenizer avoids an unchecked conversion.
        PTBTokenizer<Word> tokenizer =
            new PTBTokenizer<Word>(new StringReader(line), new WordTokenFactory(), "");
        while (tokenizer.hasNext()) {
            tokens.add(tokenizer.next());
        }
    } else {
        for (String word : line.split(" ")) {
            tokens.add(new Word(word));
        }
    }
    return tokens;
}
/**
 * Parses a tokenized sentence with the loaded parser.
 *
 * @param tokens the tokens of one sentence
 * @return the tree produced by {@code parser.apply(tokens)}
 */
public Tree parse(List<HasWord> tokens) {
    return parser.apply(tokens);
}
/**
 * Collects the label of every non-leaf node of the transformed parse tree.
 *
 * <p>The input tree is first passed through {@code binarizer} and then through
 * {@code transformer}; all numbering refers to that transformed tree. Slots are
 * assigned exactly as in {@link #constTreeParents(Tree)}: the node directly above
 * the i-th leaf uses slot {@code i}, and every other node receives the next free
 * slot (starting at the number of leaves) the first time it is reached while
 * walking upwards from a leaf. Entry {@code k} of the result is therefore the
 * label of the node whose parent pointer is entry {@code k} of
 * {@code constTreeParents}.
 *
 * <p>Each slot holds the node's <em>own</em> label. Previously every non-root
 * slot was filled with the label of the node's parent, which dropped the
 * part-of-speech tags directly above the leaves.
 *
 * @param tree the parse tree to transform and label
 * @return the labels of the non-leaf nodes, indexed by slot
 */
public String[] constTreePOSTAG(Tree tree) {
    Tree binarized = binarizer.transformTree(tree);
    Tree collapsedUnary = transformer.transformTree(binarized);
    Trees.convertToCoreLabels(collapsedUnary);
    collapsedUnary.indexSpans();
    List<Tree> leaves = collapsedUnary.getLeaves();
    int size = collapsedUnary.size() - leaves.size();
    String[] tags = new String[size];
    // Maps a node's nodeNumber (relative to the transformed root) to its slot in tags.
    HashMap<Integer, Integer> index = new HashMap<>();
    // Slots 0..leaves.size()-1 are reserved for the nodes directly above the leaves.
    int idx = leaves.size();
    int leafIdx = 0;
    for (Tree leaf : leaves) {
        Tree cur = leaf.parent(collapsedUnary); // go to preterminal
        int curIdx = leafIdx++;
        while (true) {
            // Record the current node's own label in its own slot.
            tags[curIdx] = cur.label().toString();
            Tree parent = cur.parent(collapsedUnary);
            if (parent == null) {
                break; // reached the root
            }
            int parentNumber = parent.nodeNumber(collapsedUnary);
            if (index.containsKey(parentNumber)) {
                // The parent and all of its ancestors were labeled on an earlier walk.
                break;
            }
            index.put(parentNumber, idx);
            cur = parent;
            curIdx = idx++;
        }
    }
    return tags;
}
/**
 * Builds the parent-pointer array of the transformed parse tree.
 *
 * <p>The tree is passed through {@code binarizer} and {@code transformer} first.
 * The node directly above the i-th leaf is stored at position {@code i}; any other
 * node gets the next unused position (counting up from the number of leaves) when
 * it is first reached on an upward walk. Each entry holds the 1-based position of
 * the node's parent, and the root holds 0.
 *
 * @param tree the parse tree to transform
 * @return the parent pointers, one per non-leaf node of the transformed tree
 */
public int[] constTreeParents(Tree tree) {
    Tree root = transformer.transformTree(binarizer.transformTree(tree));
    Trees.convertToCoreLabels(root);
    root.indexSpans();

    List<Tree> leafNodes = root.getLeaves();
    int[] parents = new int[root.size() - leafNodes.size()];
    // nodeNumber (relative to root) -> position already assigned to that node
    HashMap<Integer, Integer> slotByNumber = new HashMap<Integer, Integer>();
    int nextSlot = leafNodes.size();

    for (int leafPos = 0; leafPos < leafNodes.size(); leafPos++) {
        Tree node = leafNodes.get(leafPos).parent(root); // node directly above the leaf
        int slot = leafPos;
        while (true) {
            Tree above = node.parent(root);
            if (above == null) {
                parents[slot] = 0; // the root has no parent
                break;
            }
            int number = above.nodeNumber(root);
            boolean alreadySeen = slotByNumber.containsKey(number);
            int aboveSlot;
            if (alreadySeen) {
                aboveSlot = slotByNumber.get(number);
            } else {
                aboveSlot = nextSlot++;
                slotByNumber.put(number, aboveSlot);
            }
            parents[slot] = aboveSlot + 1;
            if (alreadySeen) {
                // Everything above was filled in on an earlier walk.
                break;
            }
            node = above;
            slot = aboveSlot;
        }
    }
    return parents;
}
/**
 * Converts a constituency parse into a dependency structure and returns it as a
 * parent-pointer array.
 *
 * @param tree   the constituency parse of the sentence
 * @param tokens the tokens of the sentence; only its size is used
 * @return for each token position the index reported by the governor of its
 *         dependency, or -1 when no dependency names that token as dependent
 */
public int[] depTreeParents(Tree tree, List<HasWord> tokens) {
    GrammaticalStructure structure = gsf.newGrammaticalStructure(tree);
    Collection<TypedDependency> dependencies = structure.typedDependencies();

    int[] heads = new int[tokens.size()];
    // -1 marks a token that never receives a parent below.
    for (int pos = 0; pos < heads.length; pos++) {
        heads[pos] = -1;
    }

    for (TypedDependency dependency : dependencies) {
        // Dependent indices are shifted down by one; governor indices are stored as-is,
        // which leaves 0 free for the root.
        heads[dependency.dep().index() - 1] = dependency.gov().index();
    }
    return heads;
}
/**
 * Writes the tokens of one sentence as a single space-separated line.
 *
 * <p>When tokenization is enabled each token is mapped back through
 * {@code PTBTokenizer.ptbToken2Text} before being written. An empty token list
 * produces an empty line instead of failing with an index error, so the output
 * stays aligned with the input line by line.
 *
 * @param tokens the tokens of one sentence
 * @throws IOException if writing to the token file fails
 */
public void printTokens(List<HasWord> tokens) throws IOException {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < tokens.size(); i++) {
        if (i > 0) {
            sb.append(' ');
        }
        String word = tokens.get(i).word();
        sb.append(tokenize ? PTBTokenizer.ptbToken2Text(word) : word);
    }
    sb.append('\n');
    tokWriter.write(sb.toString());
}
/**
 * Writes a parent-pointer array as a single space-separated line.
 *
 * <p>An empty array produces an empty line instead of failing with an index error.
 *
 * @param parents the parent pointers of one sentence
 * @throws IOException if writing to the parent file fails
 */
public void printParents(int[] parents) throws IOException {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < parents.length; i++) {
        if (i > 0) {
            sb.append(' ');
        }
        sb.append(parents[i]);
    }
    sb.append('\n');
    parentWriter.write(sb.toString());
}
/**
 * Writes the node tags of one sentence as a single lower-cased,
 * space-separated line.
 *
 * <p>Lower-casing uses {@code Locale.ROOT} so the output does not depend on the
 * JVM's default locale (for example, a Turkish locale would otherwise turn "IN"
 * into "ın"). An empty array produces an empty line instead of failing with an
 * index error.
 *
 * @param tags the tags of one sentence
 * @throws IOException if writing to the tag file fails
 */
public void printTags(String[] tags) throws IOException {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < tags.length; i++) {
        if (i > 0) {
            sb.append(' ');
        }
        sb.append(tags[i]);
    }
    sb.append('\n');
    // Fully qualified so no additional import is required.
    tagWriter.write(sb.toString().toLowerCase(java.util.Locale.ROOT));
}
/**
 * Closes all output writers.
 *
 * <p>Every writer is closed even if closing an earlier one throws, so a failure
 * on one file cannot leak the remaining file handles.
 *
 * @throws IOException if closing any of the writers fails
 */
public void close() throws IOException {
    try {
        if (tokWriter != null) {
            tokWriter.close();
        }
    } finally {
        try {
            parentWriter.close();
        } finally {
            tagWriter.close();
        }
    }
}
/**
 * Command-line entry point: reads sentences from standard input, one per line,
 * parses each and writes its tokens (optional), parent pointers and node tags.
 *
 * <p>Recognized options: {@code -parentpath} and {@code -tagpath} (both required),
 * {@code -tokpath} (optional token output), {@code -tokenize} (tokenize the input)
 * and {@code -deps} (write dependency parents instead of constituency parents).
 *
 * @param args command-line arguments
 * @throws Exception if reading, parsing or writing fails
 */
public static void main(String[] args) throws Exception {
    Properties props = StringUtils.argsToProperties(args);
    // Both output paths are needed: the constructor opens a writer for each of them.
    if (!props.containsKey("parentpath") || !props.containsKey("tagpath")) {
        System.err.println(
            "usage: java ConstituencyParse -deps - -tokenize - -tokpath <tokpath> -parentpath <parentpath> -tagpath <tagpath>");
        System.exit(1);
    }

    // whether to tokenize input sentences
    boolean tokenize = props.containsKey("tokenize");

    // whether to produce dependency trees from the constituency parse
    boolean deps = props.containsKey("deps");

    String tokPath = props.containsKey("tokpath") ? props.getProperty("tokpath") : null;
    String parentPath = props.getProperty("parentpath");
    String tagPath = props.getProperty("tagpath");
    ConstituencyParse processor = new ConstituencyParse(tokPath, parentPath, tagPath, tokenize);

    try {
        Scanner stdin = new Scanner(System.in);
        int count = 0;
        long start = System.currentTimeMillis();
        // Process every input line (a leftover two-line cap used to stop the run early).
        while (stdin.hasNextLine()) {
            String line = stdin.nextLine();
            List<HasWord> tokens = processor.sentenceToTokens(line);
            Tree parse = processor.parse(tokens);

            // produce parent pointer representation
            int[] parents = deps ? processor.depTreeParents(parse, tokens)
                                 : processor.constTreeParents(parse);
            String[] tags = processor.constTreePOSTAG(parse);

            // print
            if (tokPath != null) {
                processor.printTokens(tokens);
            }
            processor.printParents(parents);
            processor.printTags(tags);

            count++;
            if (count % 100 == 0) {
                double elapsed = (System.currentTimeMillis() - start) / 1000.0;
                System.err.printf("Parsed %d lines (%.2fs)\n", count, elapsed);
            }
        }

        long totalTimeMillis = System.currentTimeMillis() - start;
        // Milliseconds to seconds is a division by 1000; avoid dividing by zero on empty input.
        double perLineMillis = (count == 0) ? 0.0 : totalTimeMillis / (double) count;
        System.err.printf("Done: %d lines in %.2fs (%.1fms per line)\n",
            count, totalTimeMillis / 1000.0, perLineMillis);
    } finally {
        // Flush and release the output files even when parsing fails part-way.
        processor.close();
    }
}
}https://stackoverflow.com/questions/37112661
复制相似问题