001 package org.maltparser.core.symbol.trie;
002
003 import java.io.BufferedReader;
004 import java.io.BufferedWriter;
005 import java.io.FileInputStream;
006 import java.io.FileNotFoundException;
007 import java.io.FileOutputStream;
008 import java.io.UnsupportedEncodingException;
009
010 import java.io.IOException;
011 import java.io.InputStreamReader;
012 import java.io.OutputStreamWriter;
013 import java.util.HashMap;
014 import java.util.Set;
015 import java.util.regex.Pattern;
016 import java.util.regex.PatternSyntaxException;
017
018 import org.apache.log4j.Logger;
019
020 import org.maltparser.core.exception.MaltChainedException;
021 import org.maltparser.core.symbol.SymbolException;
022 import org.maltparser.core.symbol.SymbolTable;
023 import org.maltparser.core.symbol.SymbolTableHandler;
024
025
026 /**
027
028 @author Johan Hall
029 @since 1.0
030 */
031 public class TrieSymbolTableHandler implements SymbolTableHandler {
032 private Trie trie;
033 private HashMap<String, TrieSymbolTable> symbolTables;
034
035 public TrieSymbolTableHandler() {
036 trie = new Trie();
037 symbolTables = new HashMap<String, TrieSymbolTable>();
038 }
039
040 public TrieSymbolTable addSymbolTable(String tableName) throws MaltChainedException {
041 TrieSymbolTable symbolTable = symbolTables.get(tableName);
042 if (symbolTable == null) {
043 symbolTable = new TrieSymbolTable(tableName, trie);
044 symbolTables.put(tableName, symbolTable);
045 }
046 return symbolTable;
047 }
048
049 public TrieSymbolTable addSymbolTable(String tableName, SymbolTable parentTable) throws MaltChainedException {
050 TrieSymbolTable symbolTable = symbolTables.get(tableName);
051 if (symbolTable == null) {
052 TrieSymbolTable trieParentTable = (TrieSymbolTable)parentTable;
053 symbolTable = new TrieSymbolTable(tableName, trie, trieParentTable.getColumnCategory(), trieParentTable.getNullValueStrategy());
054 symbolTables.put(tableName, symbolTable);
055 }
056 return symbolTable;
057 }
058
059 public TrieSymbolTable addSymbolTable(String tableName, int columnCategory, String nullValueStrategy) throws MaltChainedException {
060 TrieSymbolTable symbolTable = symbolTables.get(tableName);
061 if (symbolTable == null) {
062 symbolTable = new TrieSymbolTable(tableName, trie, columnCategory, nullValueStrategy);
063 symbolTables.put(tableName, symbolTable);
064 }
065 return symbolTable;
066 }
067
068 public TrieSymbolTable addSymbolTable(String tableName, int columnCategory, String nullValueStrategy, String rootLabel) throws MaltChainedException {
069 TrieSymbolTable symbolTable = symbolTables.get(tableName);
070 if (symbolTable == null) {
071 symbolTable = new TrieSymbolTable(tableName, trie, columnCategory, nullValueStrategy, rootLabel);
072 symbolTables.put(tableName, symbolTable);
073 }
074 return symbolTable;
075 }
076
077 public TrieSymbolTable getSymbolTable(String tableName) {
078 return symbolTables.get(tableName);
079 }
080
081 public Set<String> getSymbolTableNames() {
082 return symbolTables.keySet();
083 }
084
085 public void save(OutputStreamWriter osw) throws MaltChainedException {
086 try {
087 BufferedWriter bout = new BufferedWriter(osw);
088 for (TrieSymbolTable table : symbolTables.values()) {
089 table.saveHeader(bout);
090 }
091 bout.write('\n');
092 for (TrieSymbolTable table : symbolTables.values()) {
093 table.save(bout);
094 }
095 bout.close();
096 } catch (IOException e) {
097 throw new SymbolException("Could not save the symbol tables. ", e);
098 }
099 }
100
101 public void save(String fileName, String charSet) throws MaltChainedException {
102 try {
103 save(new OutputStreamWriter(new FileOutputStream(fileName), charSet));
104 } catch (FileNotFoundException e) {
105 throw new SymbolException("The symbol table file '"+fileName+"' cannot be created. ", e);
106 } catch (UnsupportedEncodingException e) {
107 throw new SymbolException("The char set '"+charSet+"' is not supported. ", e);
108 }
109 }
110
111 public void loadHeader(BufferedReader bin) throws MaltChainedException {
112 String fileLine = "";
113 Pattern tabPattern = Pattern.compile("\t");
114 try {
115 while ((fileLine = bin.readLine()) != null) {
116 if (fileLine.length() == 0 || fileLine.charAt(0) != '\t') {
117 break;
118 }
119 String items[];
120 try {
121 items = tabPattern.split(fileLine.substring(1));
122 } catch (PatternSyntaxException e) {
123 throw new SymbolException("The header line of the symbol table '"+fileLine.substring(1)+"' could not split into atomic parts. ", e);
124 }
125 if (items.length != 4) {
126 throw new SymbolException("The header line of the symbol table '"+fileLine.substring(1)+"' must contain four columns. ");
127 }
128 if (items[3].equals("#DUMMY#")) {
129 addSymbolTable(items[0], Integer.parseInt(items[1]), items[2]);
130 } else {
131 addSymbolTable(items[0], Integer.parseInt(items[1]), items[2], items[3]);
132 }
133 }
134 } catch (NumberFormatException e) {
135 throw new SymbolException("The symbol table file (.sym) contains a non-integer value in the header. ", e);
136 } catch (IOException e) {
137 throw new SymbolException("Could not load the symbol table. ", e);
138 }
139 }
140
141
142 public void load(InputStreamReader isr) throws MaltChainedException {
143 try {
144 BufferedReader bin = new BufferedReader(isr);
145 String fileLine;
146 SymbolTable table = null;
147 bin.mark(2);
148 if (bin.read() == '\t') {
149 bin.reset();
150 loadHeader(bin);
151 } else {
152 bin.reset();
153 }
154 while ((fileLine = bin.readLine()) != null) {
155 if (fileLine.length() > 0) {
156 table = addSymbolTable(fileLine);
157 table.load(bin);
158 }
159 }
160 bin.close();
161 } catch (IOException e) {
162 throw new SymbolException("Could not load the symbol tables. ", e);
163 }
164 }
165
166 public void load(String fileName, String charSet) throws MaltChainedException {
167 try {
168 load(new InputStreamReader(new FileInputStream(fileName), charSet));
169
170 } catch (FileNotFoundException e) {
171 throw new SymbolException("The symbol table file '"+fileName+"' cannot be found. ", e);
172 } catch (UnsupportedEncodingException e) {
173 throw new SymbolException("The char set '"+charSet+"' is not supported. ", e);
174 }
175 }
176
177
178 public SymbolTable loadTagset(String fileName, String tableName, String charSet, int columnCategory, String nullValueStrategy) throws MaltChainedException {
179 try {
180 BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), charSet));
181 String fileLine;
182 TrieSymbolTable table = addSymbolTable(tableName, columnCategory, nullValueStrategy);
183
184 while ((fileLine = br.readLine()) != null) {
185 table.addSymbol(fileLine.trim());
186 }
187 return table;
188 } catch (FileNotFoundException e) {
189 throw new SymbolException("The tagset file '"+fileName+"' cannot be found. ", e);
190 } catch (UnsupportedEncodingException e) {
191 throw new SymbolException("The char set '"+charSet+"' is not supported. ", e);
192 } catch (IOException e) {
193 throw new SymbolException("The tagset file '"+fileName+"' cannot be loaded. ", e);
194 }
195 }
196
197 public SymbolTable loadTagset(String fileName, String tableName, String charSet, int columnCategory, String nullValueStrategy, String rootLabel) throws MaltChainedException {
198 try {
199 BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), charSet));
200 String fileLine;
201 TrieSymbolTable table = addSymbolTable(tableName, columnCategory, nullValueStrategy, rootLabel);
202
203 while ((fileLine = br.readLine()) != null) {
204 table.addSymbol(fileLine.trim());
205 }
206 return table;
207 } catch (FileNotFoundException e) {
208 throw new SymbolException("The tagset file '"+fileName+"' cannot be found. ", e);
209 } catch (UnsupportedEncodingException e) {
210 throw new SymbolException("The char set '"+charSet+"' is not supported. ", e);
211 } catch (IOException e) {
212 throw new SymbolException("The tagset file '"+fileName+"' cannot be loaded. ", e);
213 }
214 }
215
216 public void printSymbolTables(Logger logger) throws MaltChainedException {
217 for (TrieSymbolTable table : symbolTables.values()) {
218 table.printSymbolTable(logger);
219 }
220 }
221 }