001 package org.maltparser.core.syntaxgraph.writer;
002
003 import java.io.BufferedWriter;
004 import java.io.FileNotFoundException;
005 import java.io.FileOutputStream;
006 import java.io.IOException;
007 import java.io.OutputStream;
008 import java.io.OutputStreamWriter;
009 import java.io.UnsupportedEncodingException;
010 import java.util.SortedMap;
011 import java.util.TreeMap;
012 import java.util.regex.PatternSyntaxException;
013
014 import org.maltparser.core.exception.MaltChainedException;
015
016 import org.maltparser.core.helper.Util;
017 import org.maltparser.core.io.dataformat.ColumnDescription;
018 import org.maltparser.core.io.dataformat.DataFormatException;
019 import org.maltparser.core.io.dataformat.DataFormatInstance;
020 import org.maltparser.core.syntaxgraph.PhraseStructure;
021 import org.maltparser.core.syntaxgraph.TokenStructure;
022 import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
023 import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
024 import org.maltparser.core.syntaxgraph.node.TokenNode;
025 import org.maltparser.core.syntaxgraph.reader.TigerXMLHeader;
026 import org.maltparser.ml.libsvm.LibsvmException;
027 /**
028 *
029 *
030 * @author Johan Hall
031 */
032 public class TigerXMLWriter implements SyntaxGraphWriter {
033 private enum RootHandling {
034 TALBANKEN, NORMAL
035 };
036
037 private BufferedWriter writer;
038 private DataFormatInstance dataFormatInstance;
039 private String optionString;
040 private int sentenceCount;
041 private TigerXMLHeader header;
042 // private boolean hasWriteTigerXMLHeader = false;
043 private RootHandling rootHandling;
044 private String sentencePrefix = "s";
045 private StringBuilder sentenceID;
046 private StringBuilder tmpID;
047 private StringBuilder rootID;
048 private int START_ID_OF_NONTERMINALS = 500;
049 private boolean labeledTerminalID;
050 private String VROOT_SYMBOL = "VROOT";
051 private boolean useVROOT = false;
052 // private String fileName = null;
053 // private String charsetName = null;
054 private boolean closeStream = true;
055
056 public TigerXMLWriter() {
057 sentenceID = new StringBuilder();
058 tmpID = new StringBuilder();
059 rootID = new StringBuilder();
060 labeledTerminalID = false;
061 }
062
063 public void open(String fileName, String charsetName) throws MaltChainedException {
064 try {
065 // this.fileName = fileName;
066 // this.charsetName = charsetName;
067 open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName));
068 } catch (FileNotFoundException e) {
069 throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e);
070 } catch (UnsupportedEncodingException e) {
071 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
072 }
073 }
074
075 public void open(OutputStream os, String charsetName) throws MaltChainedException {
076 try {
077 if (os == System.out || os == System.err) {
078 closeStream = false;
079 }
080 open(new OutputStreamWriter(os, charsetName));
081 } catch (UnsupportedEncodingException e) {
082 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
083 }
084 }
085
086 private void open(OutputStreamWriter osw) throws MaltChainedException {
087 setWriter(new BufferedWriter(osw));
088 setSentenceCount(0);
089 }
090
091 public void writeProlog() throws MaltChainedException {
092 // if (fileName == null || charsetName == null) {
093 writeHeader();
094 // }
095 }
096
097 public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException {
098 if (syntaxGraph == null || dataFormatInstance == null) {
099 return;
100 }
101 if (syntaxGraph.hasTokens()) {
102 sentenceCount++;
103 final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
104 try {
105 sentenceID.setLength(0);
106 sentenceID.append(sentencePrefix);
107 if (phraseStructure.getSentenceID() != 0) {
108 sentenceID.append(Integer.toString(phraseStructure.getSentenceID()));
109 } else {
110 sentenceID.append(Integer.toString(sentenceCount));
111 }
112 writer.write(" <s id=\"");
113 writer.write(sentenceID.toString());
114 writer.write("\">\n");
115
116 setRootID(phraseStructure);
117 writer.write(" <graph root=\"");
118 writer.write(rootID.toString());
119 writer.write("\" ");
120 writer.write("discontinuous=\"");
121 writer.write(Boolean.toString(!phraseStructure.isContinuous()));
122 writer.write("\">\n");
123
124 writeTerminals(phraseStructure);
125 if (phraseStructure.nTokenNode() != 1 || rootHandling.equals(RootHandling.TALBANKEN)) {
126 writeNonTerminals(phraseStructure);
127 } else {
128 writer.write(" <nonterminals/>\n");
129 }
130 writer.write(" </graph>\n");
131 writer.write(" </s>\n");
132 } catch (IOException e) {
133 throw new DataFormatException("The TigerXML writer could not write to file. ", e);
134 }
135 }
136 }
137
138 private void setRootID(PhraseStructure phraseStructure) throws MaltChainedException {
139 useVROOT = false;
140 PhraseStructureNode root = phraseStructure.getPhraseStructureRoot();
141 for (ColumnDescription column : dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptionSet()) {
142 if (root.hasLabel(column.getSymbolTable()) && root.getLabelSymbol(column.getSymbolTable()).equals(VROOT_SYMBOL)) {
143 useVROOT = true;
144 break;
145 }
146 }
147 if (useVROOT) {
148 rootID.setLength(0);
149 rootID.append(sentenceID);
150 rootID.append('_');
151 rootID.append(VROOT_SYMBOL);
152 } else if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0 && !root.isLabeled()) {
153 rootID.setLength(0);
154 rootID.append(sentenceID);
155 rootID.append("_1");
156 } else {
157 rootID.setLength(0);
158 rootID.append(sentenceID);
159 rootID.append('_');
160 // if (rootHandling.equals(RootHandling.NORMAL)) {
161 rootID.append(Integer.toString(START_ID_OF_NONTERMINALS+phraseStructure.nNonTerminals()));
162 // } else if (rootHandling.equals(RootHandling.TALBANKEN)) {
163 // rootID.append(Integer.toString(START_ID_OF_NONTERMINALS+1));
164 // }
165 }
166
167 }
168
169 public void writeEpilog() throws MaltChainedException {
170 writeTail();
171 }
172
173 public BufferedWriter getWriter() {
174 return writer;
175 }
176
177 public void setWriter(BufferedWriter writer) {
178 this.writer = writer;
179 }
180
181 public void close() throws MaltChainedException {
182 try {
183 if (writer != null) {
184 writer.flush();
185 if (closeStream) {
186 writer.close();
187 }
188 writer = null;
189 }
190 } catch (IOException e) {
191 throw new DataFormatException("Could not close the output file. ", e);
192 }
193 }
194
195 private void writeHeader() throws MaltChainedException {
196 try {
197 if (header == null) {
198 header = new TigerXMLHeader(dataFormatInstance.getSymbolTables());
199 }
200 writer.write(header.toTigerXML());
201 // hasWriteTigerXMLHeader = true;
202 } catch (IOException e) {
203 throw new DataFormatException("The TigerXML writer could not write to file. ", e);
204 }
205 }
206
207
208 private void writeTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
209 try {
210 writer.write(" <terminals>\n");
211 for (int index : phraseStructure.getTokenIndices()) {
212 final PhraseStructureNode t = phraseStructure.getTokenNode(index);
213 writer.write(" <t ");
214 if (!labeledTerminalID) {
215 tmpID.setLength(0);
216 tmpID.append(sentenceID);
217 tmpID.append('_');
218 tmpID.append(Integer.toString(t.getIndex()));
219 writer.write("id=\"");writer.write(tmpID.toString());writer.write("\" ");
220 }
221
222 for (ColumnDescription column : dataFormatInstance.getInputColumnDescriptionSet()) {
223 writer.write(column.getName().toLowerCase());
224 writer.write("=\"");
225 writer.write(Util.xmlEscape(t.getLabelSymbol(column.getSymbolTable())));
226 writer.write("\" ");
227 }
228 writer.write("/>\n");
229 }
230 writer.write(" </terminals>\n");
231 } catch (IOException e) {
232 throw new DataFormatException("The TigerXML writer is not able to write. ", e);
233 }
234 }
235
236 public void writeNonTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
237 try {
238 SortedMap<Integer,Integer> heights = new TreeMap<Integer,Integer>();
239 for (int index : phraseStructure.getNonTerminalIndices()) {
240 heights.put(index, ((NonTerminalNode)phraseStructure.getNonTerminalNode(index)).getHeight());
241 }
242 writer.write(" <nonterminals>\n");
243 boolean done = false;
244 int h = 1;
245 while (!done) {
246 done = true;
247 for (int index : phraseStructure.getNonTerminalIndices()) {
248 if (heights.get(index) == h) {
249 NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
250 tmpID.setLength(0);
251 tmpID.append(sentenceID);
252 tmpID.append('_');
253 tmpID.append(Integer.toString(nt.getIndex()+START_ID_OF_NONTERMINALS-1));
254 writeNonTerminal(nt, tmpID.toString());
255 done = false;
256 }
257 }
258 h++;
259 }
260
261 writeNonTerminal((NonTerminalNode)phraseStructure.getPhraseStructureRoot(),rootID.toString());
262 writer.write(" </nonterminals>\n");
263 } catch (IOException e) {
264 throw new DataFormatException("The TigerXML writer is not able to write. ", e);
265 }
266 }
267
268 public void writeNonTerminal(NonTerminalNode nt, String id) throws MaltChainedException {
269 try {
270 writer.write(" <nt");
271 writer.write(" id=\"");writer.write(id);writer.write("\" ");
272 for (ColumnDescription column : dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptionSet()) {
273 if (nt.hasLabel(column.getSymbolTable())) {
274 writer.write(column.getName().toLowerCase());
275 writer.write("=");
276 writer.write("\"");
277 writer.write(Util.xmlEscape(nt.getLabelSymbol(column.getSymbolTable())));
278 writer.write("\" ");
279 }
280 }
281 writer.write(">\n");
282
283 for (int i = 0, n = nt.nChildren(); i < n; i++) {
284 PhraseStructureNode child = nt.getChild(i);
285 writer.write(" <edge ");
286
287 for (ColumnDescription column : dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptionSet()) {
288 if (child.hasParentEdgeLabel(column.getSymbolTable())) {
289 writer.write(column.getName().toLowerCase());
290 writer.write("=\"");
291 writer.write(Util.xmlEscape(child.getParentEdgeLabelSymbol(column.getSymbolTable())));
292 writer.write("\" ");
293 }
294 }
295 if (child instanceof TokenNode) {
296 if (!labeledTerminalID) {
297 tmpID.setLength(0);
298 tmpID.append(sentenceID);
299 tmpID.append('_');
300 tmpID.append(Integer.toString(child.getIndex()));
301 writer.write(" idref=\"");writer.write(tmpID.toString());writer.write("\"");
302 } else {
303 writer.write(" idref=\"");writer.write(child.getLabelSymbol(dataFormatInstance.getInputSymbolTables().get("ID")));writer.write("\"");
304 }
305
306 } else {
307 tmpID.setLength(0);
308 tmpID.append(sentenceID);
309 tmpID.append('_');
310 tmpID.append(Integer.toString(child.getIndex()+START_ID_OF_NONTERMINALS-1));
311 writer.write(" idref=\"");writer.write(tmpID.toString());writer.write("\"");
312 }
313 writer.write(" />\n");
314 }
315 writer.write(" </nt>\n");
316 } catch (IOException e) {
317 throw new DataFormatException("The TigerXML writer is not able to write. ", e);
318 }
319 }
320
321
322 private void writeTail() throws MaltChainedException {
323 try {
324 writer.write(" </body>\n");
325 writer.write("</corpus>\n");
326 writer.flush();
327 // if (fileName != null && charsetName != null) {
328 // writer.close();
329 // writer = null;
330 // BufferedWriter headerWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName+".header"),charsetName));
331 // if (header == null) {
332 // header = new TigerXMLHeader(dataFormatInstance.getSymbolTables());
333 // }
334 //
335 // headerWriter.write(header.toTigerXML());
336 // headerWriter.flush();
337 // headerWriter.close();
338 // }
339 } catch (IOException e) {
340 throw new DataFormatException("The TigerXML writer is not able to write. ", e);
341 }
342 }
343
344 public int getSentenceCount() {
345 return sentenceCount;
346 }
347
348 public void setSentenceCount(int sentenceCount) {
349 this.sentenceCount = sentenceCount;
350 }
351
352 public DataFormatInstance getDataFormatInstance() {
353 return dataFormatInstance;
354 }
355
356 public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
357 this.dataFormatInstance = dataFormatInstance;
358 labeledTerminalID = (dataFormatInstance.getInputColumnDescriptions().containsKey("id") || dataFormatInstance.getInputColumnDescriptions().containsKey("ID"));
359 }
360
361 public String getOptions() {
362 return optionString;
363 }
364
365 public void setOptions(String optionString) throws MaltChainedException {
366 this.optionString = optionString;
367 rootHandling = RootHandling.NORMAL;
368
369 String[] argv;
370 try {
371 argv = optionString.split("[_\\p{Blank}]");
372 } catch (PatternSyntaxException e) {
373 throw new DataFormatException("Could not split the TigerXML writer option '"+optionString+"'. ", e);
374 }
375 for (int i=0; i < argv.length-1; i++) {
376 if(argv[i].charAt(0) != '-') {
377 throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
378 }
379 if(++i>=argv.length) {
380 throw new DataFormatException("The last argument does not have any value. ");
381 }
382 switch(argv[i-1].charAt(1)) {
383 case 'r':
384 if (argv[i].equals("n")) {
385 rootHandling = RootHandling.NORMAL;
386 } else if (argv[i].equals("tal")) {
387 rootHandling = RootHandling.TALBANKEN;
388 }
389 break;
390 case 's':
391 try {
392 START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
393 } catch (NumberFormatException e){
394 throw new MaltChainedException("The TigerXML writer option -s must be an integer value. ");
395 }
396 break;
397 case 'v':
398 VROOT_SYMBOL = argv[i];
399 break;
400 default:
401 throw new LibsvmException("Unknown TigerXML writer option: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");
402 }
403 }
404 }
405 }