001 package org.maltparser.core.syntaxgraph.writer;
002
003 import java.io.BufferedWriter;
004 import java.io.FileNotFoundException;
005 import java.io.FileOutputStream;
006 import java.io.IOException;
007 import java.io.OutputStream;
008 import java.io.OutputStreamWriter;
009 import java.io.UnsupportedEncodingException;
010 import java.util.SortedMap;
011 import java.util.regex.PatternSyntaxException;
012
013 import org.maltparser.core.exception.MaltChainedException;
014 import org.maltparser.core.io.dataformat.ColumnDescription;
015 import org.maltparser.core.io.dataformat.DataFormatException;
016 import org.maltparser.core.io.dataformat.DataFormatInstance;
017 import org.maltparser.core.symbol.SymbolTable;
018 import org.maltparser.core.syntaxgraph.PhraseStructure;
019 import org.maltparser.core.syntaxgraph.TokenStructure;
020 import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
021 import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
022 import org.maltparser.core.syntaxgraph.node.TokenNode;
023 import org.maltparser.ml.libsvm.LibsvmException;
024 /**
025 *
026 *
027 * @author Johan Hall
028 */
029 public class BracketWriter implements SyntaxGraphWriter {
030 private enum PennWriterFormat {
031 DEFAULT, PRETTY
032 };
033 private PennWriterFormat format;
034 private BufferedWriter writer;
035 private DataFormatInstance dataFormatInstance;
036 private SortedMap<String,ColumnDescription> inputColumns;
037 private SortedMap<String,ColumnDescription> edgeLabelColumns;
038 private SortedMap<String,ColumnDescription> phraseLabelColumns;
039 private char STARTING_BRACKET = '(';
040 private String EMPTY_EDGELABEL = "??";
041 private char CLOSING_BRACKET = ')';
042 private char INPUT_SEPARATOR = ' ';
043 private char EDGELABEL_SEPARATOR = '-';
044 private char SENTENCE_SEPARATOR = '\n';
045 private String optionString;
046 private boolean closeStream = true;
047
048 public BracketWriter() {
049 }
050
051 public void open(String fileName, String charsetName) throws MaltChainedException {
052 try {
053 open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName));
054 } catch (FileNotFoundException e) {
055 throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e);
056 } catch (UnsupportedEncodingException e) {
057 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
058 }
059 }
060
061 public void open(OutputStream os, String charsetName) throws MaltChainedException {
062 try {
063 if (os == System.out || os == System.err) {
064 closeStream = false;
065 }
066 open(new OutputStreamWriter(os, charsetName));
067 } catch (UnsupportedEncodingException e) {
068 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
069 }
070 }
071
072 private void open(OutputStreamWriter osw) throws MaltChainedException {
073 setWriter(new BufferedWriter(osw));
074 }
075
076 public void writeEpilog() throws MaltChainedException {
077
078 }
079
080 public void writeProlog() throws MaltChainedException {
081
082 }
083
084 public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException {
085 if (syntaxGraph == null || dataFormatInstance == null) {
086 return;
087 }
088 if (syntaxGraph instanceof PhraseStructure && syntaxGraph.hasTokens()) {
089 // PhraseStructure phraseStructure = ((PhraseStructure) syntaxGraph);
090 if (format == PennWriterFormat.PRETTY) {
091 writeElement(((PhraseStructure) syntaxGraph).getPhraseStructureRoot(), 0);
092 } else {
093 writeElement(((PhraseStructure) syntaxGraph).getPhraseStructureRoot());
094 }
095 try {
096 writer.write(SENTENCE_SEPARATOR);
097 writer.flush();
098 } catch (IOException e) {
099 close();
100 throw new DataFormatException("Could not write to the output file. ", e);
101 }
102 }
103 }
104
105 private void writeElement(PhraseStructureNode element) throws MaltChainedException {
106 try {
107 if (element instanceof TokenNode) {
108 PhraseStructureNode t = (PhraseStructureNode)element;
109 SymbolTable table = null;
110 writer.write(STARTING_BRACKET);
111 int i = 0;
112 for (String inputColumn : inputColumns.keySet()) {
113 if (i != 0) {
114 writer.write(INPUT_SEPARATOR);
115 }
116 table = inputColumns.get(inputColumn).getSymbolTable();
117 if (t.hasLabel(table)) {
118 writer.write(t.getLabelSymbol(table));
119 }
120 if (i == 0) {
121 for (String edgeLabelColumn : edgeLabelColumns.keySet()) {
122 table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable();
123 if (t.hasParentEdgeLabel(table) && !t.getParent().isRoot() && !t.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) {
124 writer.write(EDGELABEL_SEPARATOR);
125 writer.write(t.getParentEdgeLabelSymbol(table));
126 }
127 }
128 }
129 i++;
130 }
131 writer.write(CLOSING_BRACKET);
132 } else {
133 NonTerminalNode nt = (NonTerminalNode)element;
134 writer.write(STARTING_BRACKET);
135 SymbolTable table = null;
136 int i = 0;
137 for (String phraseLabelColumn : phraseLabelColumns.keySet()) {
138 if (i != 0) {
139 writer.write(INPUT_SEPARATOR);
140 }
141 table = phraseLabelColumns.get(phraseLabelColumn).getSymbolTable();
142 if (nt.hasLabel(table)) {
143 writer.write(nt.getLabelSymbol(table));
144 }
145 if (i == 0) {
146 for (String edgeLabelColumn : edgeLabelColumns.keySet()) {
147 table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable();
148 if (nt.hasParentEdgeLabel(table) && !nt.getParent().isRoot() && !nt.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) {
149 writer.write(EDGELABEL_SEPARATOR);
150 writer.write(nt.getParentEdgeLabelSymbol(table));
151 }
152 }
153 }
154 i++;
155 }
156 for (PhraseStructureNode node : ((NonTerminalNode)element).getChildren()) {
157 writeElement(node);
158 }
159 writer.write(CLOSING_BRACKET);
160 }
161 } catch (IOException e) {
162 throw new DataFormatException("Could not write to the output file. ", e);
163 }
164 }
165
166 private String getIndentation(int depth) {
167 StringBuilder sb = new StringBuilder("");
168 for (int i = 0; i < depth; i++) {
169 sb.append("\t");
170 }
171 return sb.toString();
172 }
173
174 private void writeElement(PhraseStructureNode element, int depth) throws MaltChainedException {
175 try {
176 if (element instanceof TokenNode) {
177 PhraseStructureNode t = (PhraseStructureNode)element;
178 SymbolTable table = null;
179 writer.write("\n" + getIndentation(depth) + STARTING_BRACKET);
180 int i = 0;
181 for (String inputColumn : inputColumns.keySet()) {
182 if (i != 0) {
183 writer.write(INPUT_SEPARATOR);
184 }
185 table = inputColumns.get(inputColumn).getSymbolTable();
186 if (t.hasLabel(table)) {
187 writer.write(encodeString(t.getLabelSymbol(table)));
188 }
189 if (i == 0) {
190 for (String edgeLabelColumn : edgeLabelColumns.keySet()) {
191 table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable();
192 if (t.hasParentEdgeLabel(table) && !t.getParent().isRoot() && !t.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) {
193 writer.write(EDGELABEL_SEPARATOR);
194 writer.write(t.getParentEdgeLabelSymbol(table));
195 }
196 }
197 }
198 i++;
199 }
200 writer.write(CLOSING_BRACKET);
201 } else {
202 NonTerminalNode nt = (NonTerminalNode)element;
203 writer.write("\n" + getIndentation(depth) + STARTING_BRACKET);
204 SymbolTable table = null;
205 int i = 0;
206 for (String phraseLabelColumn : phraseLabelColumns.keySet()) {
207 if (i != 0) {
208 writer.write(INPUT_SEPARATOR);
209 }
210 table = phraseLabelColumns.get(phraseLabelColumn).getSymbolTable();
211 if (nt.hasLabel(table)) {
212 writer.write(nt.getLabelSymbol(table));
213 }
214 if (i == 0) {
215 for (String edgeLabelColumn : edgeLabelColumns.keySet()) {
216 table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable();
217 if (nt.hasParentEdgeLabel(table) && !nt.getParent().isRoot() && !nt.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) {
218 writer.write(EDGELABEL_SEPARATOR);
219 writer.write(nt.getParentEdgeLabelSymbol(table));
220 }
221 }
222 }
223 i++;
224 }
225 for (PhraseStructureNode node : ((NonTerminalNode)element).getChildren()) {
226 writeElement(node, depth + 1);
227 }
228 writer.write("\n" + getIndentation(depth) + CLOSING_BRACKET);
229 }
230 } catch (IOException e) {
231 throw new DataFormatException("Could not write to the output file. ", e);
232 }
233 }
234
235 public BufferedWriter getWriter() {
236 return writer;
237 }
238
239 public void setWriter(BufferedWriter writer) throws MaltChainedException {
240 close();
241 this.writer = writer;
242 }
243
244 public DataFormatInstance getDataFormatInstance() {
245 return dataFormatInstance;
246 }
247
248 public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
249 this.dataFormatInstance = dataFormatInstance;
250 inputColumns = dataFormatInstance.getInputColumnDescriptions();
251 edgeLabelColumns = dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptions();
252 phraseLabelColumns = dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptions();
253 }
254
255 public String getOptions() {
256 return optionString;
257 }
258
259 public void setOptions(String optionString) throws MaltChainedException {
260 this.optionString = optionString;
261 format = PennWriterFormat.DEFAULT;
262
263 String[] argv;
264 try {
265 argv = optionString.split("[_\\p{Blank}]");
266 } catch (PatternSyntaxException e) {
267 throw new DataFormatException("Could not split the bracket writer option '"+optionString+"'. ", e);
268 }
269 for (int i=0; i < argv.length-1; i++) {
270 if(argv[i].charAt(0) != '-') {
271 throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
272 }
273 if(++i>=argv.length) {
274 throw new DataFormatException("The last argument does not have any value. ");
275 }
276 switch(argv[i-1].charAt(1)) {
277 case 'f':
278 if (argv[i].equals("p")) {
279 format = PennWriterFormat.PRETTY;
280 } else if (argv[i].equals("p")) {
281 format = PennWriterFormat.DEFAULT;
282 }
283 break;
284 default:
285 throw new LibsvmException("Unknown bracket writer option: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");
286 }
287 }
288 }
289
290 public void close() throws MaltChainedException {
291 try {
292 if (writer != null) {
293 writer.flush();
294 if (closeStream) {
295 writer.close();
296 }
297 writer = null;
298 }
299 } catch (IOException e) {
300 throw new DataFormatException("Could not close the output file. ", e);
301 }
302 }
303
304 private String encodeString(String string) {
305 return string.replace("(", "-LRB-").replace(")", "-RRB-").replace("[", "-LSB-").replace("]", "-RSB-").replace("{", "-LCB-").replace("}", "-RCB-");
306 }
307 }