001 package org.maltparser.core.syntaxgraph.writer;
002
003 import java.io.BufferedWriter;
004 import java.io.FileNotFoundException;
005 import java.io.FileOutputStream;
006 import java.io.IOException;
007 import java.io.OutputStream;
008 import java.io.OutputStreamWriter;
009 import java.io.UnsupportedEncodingException;
010 import java.util.Iterator;
011 import java.util.LinkedHashMap;
012 import java.util.SortedMap;
013 import java.util.TreeMap;
014 import java.util.regex.PatternSyntaxException;
015
016 import org.maltparser.core.exception.MaltChainedException;
017 import org.maltparser.core.io.dataformat.ColumnDescription;
018 import org.maltparser.core.io.dataformat.DataFormatException;
019 import org.maltparser.core.io.dataformat.DataFormatInstance;
020 import org.maltparser.core.syntaxgraph.PhraseStructure;
021 import org.maltparser.core.syntaxgraph.TokenStructure;
022 import org.maltparser.core.syntaxgraph.edge.Edge;
023 import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
024 import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
025 import org.maltparser.ml.libsvm.LibsvmException;
026 /**
027 *
028 *
029 * @author Johan Hall
030 */
031 public class NegraWriter implements SyntaxGraphWriter {
032 private BufferedWriter writer;
033 private DataFormatInstance dataFormatInstance;
034 private String optionString;
035 private int sentenceCount;
036 private LinkedHashMap<Integer, Integer> nonTerminalIndexMap;
037 private int START_ID_OF_NONTERMINALS = 500;
038 private boolean closeStream = true;
039
040 public NegraWriter() {
041 nonTerminalIndexMap = new LinkedHashMap<Integer, Integer>();
042 }
043
044 public void open(String fileName, String charsetName) throws MaltChainedException {
045 try {
046 open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName));
047 } catch (FileNotFoundException e) {
048 throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e);
049 } catch (UnsupportedEncodingException e) {
050 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
051 }
052 }
053
054 public void open(OutputStream os, String charsetName) throws MaltChainedException {
055 try {
056 if (os == System.out || os == System.err) {
057 closeStream = false;
058 }
059 open(new OutputStreamWriter(os, charsetName));
060 } catch (UnsupportedEncodingException e) {
061 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
062 }
063 }
064
065 private void open(OutputStreamWriter osw) throws MaltChainedException {
066 setWriter(new BufferedWriter(osw));
067 setSentenceCount(0);
068 }
069
070 public void writeProlog() throws MaltChainedException { }
071
072 public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException {
073 if (syntaxGraph == null || dataFormatInstance == null || !(syntaxGraph instanceof PhraseStructure) || !syntaxGraph.hasTokens()) {
074 return;
075 }
076 PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
077 sentenceCount++;
078 try {
079 writer.write("#BOS ");
080 if (phraseStructure.getSentenceID() != 0) {
081 writer.write(Integer.toString(phraseStructure.getSentenceID()));
082 } else {
083 writer.write(Integer.toString(sentenceCount));
084 }
085 writer.write('\n');
086
087 if (phraseStructure.hasNonTerminals()) {
088 calculateIndices(phraseStructure);
089 writeTerminals(phraseStructure);
090 writeNonTerminals(phraseStructure);
091 } else {
092 writeTerminals(phraseStructure);
093 }
094 writer.write("#EOS ");
095 if (phraseStructure.getSentenceID() != 0) {
096 writer.write(Integer.toString(phraseStructure.getSentenceID()));
097 } else {
098 writer.write(Integer.toString(sentenceCount));
099 }
100 writer.write('\n');
101 } catch (IOException e) {
102 throw new DataFormatException("Could not write to the output file. ", e);
103 }
104 }
105 public void writeEpilog() throws MaltChainedException { }
106
107
108 private void calculateIndices(PhraseStructure phraseStructure) throws MaltChainedException {
109 final SortedMap<Integer,Integer> heights = new TreeMap<Integer,Integer>();
110 for (int index : phraseStructure.getNonTerminalIndices()) {
111 heights.put(index, ((NonTerminalNode)phraseStructure.getNonTerminalNode(index)).getHeight());
112 }
113
114 boolean done = false;
115 int h = 1;
116 int ntid = START_ID_OF_NONTERMINALS;
117 nonTerminalIndexMap.clear();
118 while (!done) {
119 done = true;
120 for (int index : phraseStructure.getNonTerminalIndices()) {
121 if (heights.get(index) == h) {
122 NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
123 nonTerminalIndexMap.put(nt.getIndex(), ntid++);
124 // nonTerminalIndexMap.put(nt.getIndex(), nt.getIndex()+START_ID_OF_NONTERMINALS-1);
125 done = false;
126 }
127 }
128 h++;
129 }
130
131 // boolean done = false;
132 // int h = 1;
133 //// int ntid = START_ID_OF_NONTERMINALS;
134 //// nonTerminalIndexMap.clear();
135 // while (!done) {
136 // done = true;
137 // for (int index : phraseStructure.getNonTerminalIndices()) {
138 // if (heights.get(index) == h) {
139 // NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
140 //// nonTerminalIndexMap.put(nt.getIndex(), ntid++);
141 // nonTerminalIndexMap.put(nt.getIndex(), nt.getIndex()+START_ID_OF_NONTERMINALS-1);
142 // done = false;
143 // }
144 // }
145 // h++;
146 // }
147 }
148
149 private void writeTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
150 try {
151 for (int index : phraseStructure.getTokenIndices()) {
152 final PhraseStructureNode terminal = phraseStructure.getTokenNode(index);
153 final Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
154 ColumnDescription column = null;
155 int ti = 1;
156 while (columns.hasNext()) {
157 column = columns.next();
158 if (column.getCategory() == ColumnDescription.INPUT) {
159 writer.write(terminal.getLabelSymbol(column.getSymbolTable()));
160 int nTabs = 1;
161 if (ti == 1 || ti == 2) {
162 nTabs = 3 - (terminal.getLabelSymbol(column.getSymbolTable()).length() / 8);
163 } else if (ti == 3) {
164 nTabs = 1;
165 } else if (ti == 4) {
166 nTabs = 2 - (terminal.getLabelSymbol(column.getSymbolTable()).length() / 8);
167 }
168 if (nTabs < 1) {
169 nTabs = 1;
170 }
171 for (int j = 0; j < nTabs; j++) {
172 writer.write('\t');
173 }
174 ti++;
175 } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL) {
176 if (terminal.getParent() != null && terminal.hasParentEdgeLabel(column.getSymbolTable())) {
177 writer.write(terminal.getParentEdgeLabelSymbol(column.getSymbolTable()));
178 writer.write('\t');
179 } else {
180 writer.write("--\t");
181 }
182 } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL) {
183 if (terminal.getParent() == null || terminal.getParent() == phraseStructure.getPhraseStructureRoot()) {
184 writer.write('0');
185 } else {
186 writer.write(Integer.toString(nonTerminalIndexMap.get(terminal.getParent().getIndex())));
187 // writer.write(Integer.toString(terminal.getParent().getIndex()+START_ID_OF_NONTERMINALS-1));
188 }
189 }
190 }
191 for (Edge e : terminal.getIncomingSecondaryEdges()) {
192 if (e.hasLabel(column.getSymbolTable())) {
193 writer.write('\t');
194 writer.write(e.getLabelSymbol(column.getSymbolTable()));
195 writer.write('\t');
196 if (e.getSource() instanceof NonTerminalNode) {
197 writer.write(Integer.toString(nonTerminalIndexMap.get(e.getSource().getIndex())));
198 // writer.write(Integer.toString(e.getSource().getIndex()+START_ID_OF_NONTERMINALS-1));
199 } else {
200 writer.write(Integer.toString(e.getSource().getIndex()));
201 }
202 }
203 }
204 writer.write("\n");
205 }
206
207 } catch (IOException e) {
208 throw new DataFormatException("The Negra writer is not able to write. ", e);
209 }
210 }
211
212 private void writeNonTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
213 for (int index : nonTerminalIndexMap.keySet()) {
214 // for (int index : phraseStructure.getNonTerminalIndices()) {
215 NonTerminalNode nonTerminal = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
216
217 if (nonTerminal == null || nonTerminal.isRoot()) {
218 return;
219 }
220 try {
221 writer.write('#');
222 // writer.write(Integer.toString(index+START_ID_OF_NONTERMINALS-1));
223 writer.write(Integer.toString(nonTerminalIndexMap.get(index)));
224 writer.write("\t\t\t--\t\t\t");
225 if (nonTerminal.hasLabel(dataFormatInstance.getColumnDescriptionByName("CAT").getSymbolTable())) {
226 writer.write(nonTerminal.getLabelSymbol(dataFormatInstance.getColumnDescriptionByName("CAT").getSymbolTable()));
227 } else {
228 writer.write("--");
229 }
230 writer.write("\t--\t\t");
231 if (nonTerminal.hasParentEdgeLabel(dataFormatInstance.getColumnDescriptionByName("LABEL").getSymbolTable())) {
232 writer.write(nonTerminal.getParentEdgeLabelSymbol(dataFormatInstance.getColumnDescriptionByName("LABEL").getSymbolTable()));
233 } else {
234 writer.write("--");
235 }
236 writer.write('\t');
237 if (nonTerminal.getParent() == null || nonTerminal.getParent().isRoot()) {
238 writer.write('0');
239 } else {
240 // writer.write(Integer.toString(nonTerminal.getParent().getIndex()+START_ID_OF_NONTERMINALS-1));
241 writer.write(Integer.toString(nonTerminalIndexMap.get(nonTerminal.getParent().getIndex())));
242 }
243 for (Edge e : nonTerminal.getIncomingSecondaryEdges()) {
244 if (e.hasLabel(dataFormatInstance.getColumnDescriptionByName("SECEDGELABEL").getSymbolTable())) {
245 writer.write('\t');
246 writer.write(e.getLabelSymbol(dataFormatInstance.getColumnDescriptionByName("SECEDGELABEL").getSymbolTable()));
247 writer.write('\t');
248 if (e.getSource() instanceof NonTerminalNode) {
249 // writer.write(Integer.toString(e.getSource().getIndex()+START_ID_OF_NONTERMINALS-1));
250 writer.write(Integer.toString(nonTerminalIndexMap.get(e.getSource().getIndex())));
251 } else {
252 writer.write(Integer.toString(e.getSource().getIndex()));
253 }
254 }
255 }
256 writer.write("\n");
257 } catch (IOException e) {
258 throw new DataFormatException("The Negra writer is not able to write the non-terminals. ", e);
259 }
260 }
261 }
262
263 public BufferedWriter getWriter() {
264 return writer;
265 }
266
267 public void setWriter(BufferedWriter writer) {
268 this.writer = writer;
269 }
270
271 public int getSentenceCount() {
272 return sentenceCount;
273 }
274
275 public void setSentenceCount(int sentenceCount) {
276 this.sentenceCount = sentenceCount;
277 }
278
279 public DataFormatInstance getDataFormatInstance() {
280 return dataFormatInstance;
281 }
282
283 public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
284 this.dataFormatInstance = dataFormatInstance;
285 }
286
287 public String getOptions() {
288 return optionString;
289 }
290
291 public void setOptions(String optionString) throws MaltChainedException {
292 this.optionString = optionString;
293 String[] argv;
294 try {
295 argv = optionString.split("[_\\p{Blank}]");
296 } catch (PatternSyntaxException e) {
297 throw new DataFormatException("Could not split the penn writer option '"+optionString+"'. ", e);
298 }
299 for (int i=0; i < argv.length-1; i++) {
300 if(argv[i].charAt(0) != '-') {
301 throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
302 }
303 if(++i>=argv.length) {
304 throw new DataFormatException("The last argument does not have any value. ");
305 }
306 switch(argv[i-1].charAt(1)) {
307 case 's':
308 try {
309 START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
310 } catch (NumberFormatException e){
311 throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
312 }
313 break;
314 default:
315 throw new LibsvmException("Unknown svm parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");
316 }
317 }
318 }
319
320 public void close() throws MaltChainedException {
321 try {
322 if (writer != null) {
323 writer.flush();
324 if (closeStream) {
325 writer.close();
326 }
327 writer = null;
328 }
329 } catch (IOException e) {
330 throw new DataFormatException("Could not close the output file. ", e);
331 }
332 }
333 }