LibOFX
ofx_preproc.cpp
Go to the documentation of this file.
1/***************************************************************************
2 ofx_preproc.cpp
3 -------------------
4 copyright : (C) 2002 by Benoit Grégoire
5 email : benoitg@coeus.ca
6***************************************************************************/
12/***************************************************************************
13 * *
14 * This program is free software; you can redistribute it and/or modify *
15 * it under the terms of the GNU General Public License as published by *
16 * the Free Software Foundation; either version 2 of the License, or *
17 * (at your option) any later version. *
18 * *
19 ***************************************************************************/
#include "../config.h"
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdio.h>
#include <string>
#include <unistd.h> // for close()
#include "ParserEventGeneratorKit.h"
#include "libofx.h"
#include "messages.hh"
#include "ofx_sgml.hh"
#include "ofc_sgml.hh"
#include "ofx_preproc.hh"
#include "ofx_utilities.hh"
#ifdef HAVE_ICONV
#include <iconv.h>
#endif
37
38#ifdef __WIN32__
39# define DIRSEP "\\"
40#else
41# define DIRSEP "/"
42#endif
43
44#ifdef __WIN32__
45# include "win32.hh"
46# include <windows.h> // for GetModuleFileName()
47# undef ERROR
48# undef DELETE
49#endif
50
51#define LIBOFX_DEFAULT_INPUT_ENCODING "CP1252"
52#define LIBOFX_DEFAULT_OUTPUT_ENCODING "UTF-8"
53
54using namespace std;
/* The number of different paths to search for DTDs. One extra slot is
 * reserved when the build system supplies MAKEFILE_DTD_PATH. */
#ifdef MAKEFILE_DTD_PATH
const int DTD_SEARCH_PATH_NUM = 4;
#else
const int DTD_SEARCH_PATH_NUM = 3;
#endif

/* The list of paths to search for the DTDs, in priority order.
 * NOTE(review): the "~" entry is handed to the file-opening code as a literal
 * directory name; nothing in this file expands it to a home directory. */
const char *DTD_SEARCH_PATH[DTD_SEARCH_PATH_NUM] =
{
#ifdef MAKEFILE_DTD_PATH
  MAKEFILE_DTD_PATH,
#endif
  "/usr/local/share/libofx/dtd",
  "/usr/share/libofx/dtd",
  "~"
};
76
81int ofx_proc_file(LibofxContextPtr ctx, const char * p_filename)
82{
83 LibofxContext *libofx_context;
84 bool ofx_start = false;
85 bool ofx_end = false;
86 bool file_is_xml = false;
87 bool used_iconv = false;
88 ifstream input_file;
89 ofstream tmp_file;
90 char *filenames[3];
91 char tmp_filename[256];
92 int tmp_file_fd;
93#ifdef HAVE_ICONV
94 iconv_t conversion_descriptor;
95#endif
96 libofx_context = (LibofxContext*)ctx;
97
98 if (p_filename != NULL && strcmp(p_filename, "") != 0)
99 {
100 message_out(DEBUG, string("ofx_proc_file():Opening file: ") + p_filename);
101
102 input_file.open(p_filename);
103 if (!input_file)
104 {
105 message_out(ERROR, "ofx_proc_file():Unable to open the input file " + string(p_filename));
106 }
107
108 mkTempFileName("libofxtmpXXXXXX", tmp_filename, sizeof(tmp_filename));
109
110 message_out(DEBUG, "ofx_proc_file(): Creating temp file: " + string(tmp_filename));
111#ifdef __WIN32__
112 tmp_file_fd = mkstemp_win32(tmp_filename);
113#else
114 tmp_file_fd = mkstemp(tmp_filename);
115#endif
116 if (tmp_file_fd)
117 {
118 tmp_file.open(tmp_filename);
119 if (!tmp_file)
120 {
121 message_out(ERROR, "ofx_proc_file():Unable to open the created temp file " + string(tmp_filename));
122 return -1;
123 }
124 }
125 else
126 {
127 message_out(ERROR, "ofx_proc_file():Unable to create a temp file at " + string(tmp_filename));
128 return -1;
129 }
130
131 if (input_file && tmp_file)
132 {
133 std::size_t header_separator_idx;
134 string header_name;
135 string header_value;
136 string ofx_encoding;
137 string ofx_charset;
138 do
139 {
140 stringbuf buffer;
141 string s_buffer;
142 input_file.get(buffer, '\n');
143 //cout<< "got: \"" << buffer<<"\"\n";
144 s_buffer = buffer.str();
145
146 // Watch out: If input_file is in eof(), any subsequent read or
147 // peek() will fail and we must exit this loop.
148 if (!input_file.eof())
149 {
150 //cout<<"input_file.gcount(): "<<input_file.gcount()<< " s_buffer.size=" << s_buffer.size()<<" sizeof(buffer): "<<sizeof(buffer) << " peek=\"" << int(input_file.peek()) << "\"" <<endl;
151 if (input_file.fail()) // If no characters were extracted above, the failbit is set.
152 {
153 // No characters extracted means that we've reached the newline
154 // delimiter (because we already checked for EOF). We will check
155 // for and remove that newline in the next if-clause, but must
156 // remove the failbit so that peek() will work again.
157 input_file.clear();
158 }
159
160 // Is the next character really the newline?
161 if (input_file.peek() == '\n')
162 {
163 // Yes. Then discard that newline character from the stream
164 input_file.get();
165 }
166 }
167
168 if (ofx_start == false && (s_buffer.find("<?xml") != string::npos))
169 {
170 message_out(DEBUG, "ofx_proc_file(): File is an actual XML file, iconv conversion will be skipped.");
171 file_is_xml = true;
172 }
173
174 std::size_t ofx_start_idx;
175 if (ofx_start == false)
176 {
177 if (
178 (libofx_context->currentFileType() == OFX &&
179 ((ofx_start_idx = s_buffer.find("<OFX>")) != string::npos ||
180 (ofx_start_idx = s_buffer.find("<ofx>")) != string::npos))
181 ||
182 (libofx_context->currentFileType() == OFC &&
183 ((ofx_start_idx = s_buffer.find("<OFC>")) != string::npos ||
184 (ofx_start_idx = s_buffer.find("<ofc>")) != string::npos))
185 )
186 {
187 ofx_start = true;
188 if (file_is_xml == false)
189 {
190 s_buffer.erase(0, ofx_start_idx); //Fix for really broken files that don't have a newline after the header.
191 }
192 message_out(DEBUG, "ofx_proc_file():<OFX> or <OFC> has been found");
193
194 if (file_is_xml == true)
195 {
196 static char sp_charset_fixed[] = "SP_CHARSET_FIXED=1";
197 if (putenv(sp_charset_fixed) != 0)
198 {
199 message_out(ERROR, "ofx_proc_file(): putenv failed");
200 }
201 /* Normally the following would be "xml".
202 * Unfortunately, opensp's generic api will garble UTF-8 if this is
203 * set to xml. So we set any single byte encoding to avoid messing
204 * up UTF-8. Unfortunately this means that non-UTF-8 files will not
205 * get properly translated. We'd need to manually detect the
206 * encoding in the XML header and convert the xml with iconv like we
207 * do for SGML to work around the problem. Most unfortunate. */
208 static char sp_encoding[] = "SP_ENCODING=ms-dos";
209 if (putenv(sp_encoding) != 0)
210 {
211 message_out(ERROR, "ofx_proc_file(): putenv failed");
212 }
213 }
214 else
215 {
216 static char sp_charset_fixed[] = "SP_CHARSET_FIXED=1";
217 if (putenv(sp_charset_fixed) != 0)
218 {
219 message_out(ERROR, "ofx_proc_file(): putenv failed");
220 }
221 static char sp_encoding[] = "SP_ENCODING=ms-dos"; //Any single byte encoding will do, we don't want opensp messing up UTF-8;
222 if (putenv(sp_encoding) != 0)
223 {
224 message_out(ERROR, "ofx_proc_file(): putenv failed");
225 }
226#ifdef HAVE_ICONV
227 string fromcode;
228 string tocode;
229 if (ofx_encoding.compare("USASCII") == 0)
230 {
231 if (ofx_charset.compare("ISO-8859-1") == 0 || ofx_charset.compare("8859-1") == 0)
232 {
233 //Only "ISO-8859-1" is actually a legal value, but since the banks follows the spec SO well...
234 fromcode = "ISO-8859-1";
235 }
236 else if (ofx_charset.compare("1252") == 0 || ofx_charset.compare("CP1252") == 0)
237 {
238 //Only "1252" is actually a legal value, but since the banks follows the spec SO well...
239 fromcode = "CP1252";
240 }
241 else if (ofx_charset.compare("NONE") == 0)
242 {
243 fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
244 }
245 else
246 {
247 fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
248 }
249 }
250 else if (ofx_encoding.compare("UTF-8") == 0 || ofx_encoding.compare("UNICODE") == 0)
251 {
252 //While "UNICODE" isn't a legal value, some cyrilic files do specify it as such...
253 fromcode = "UTF-8";
254 }
255 else
256 {
257 fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
258 }
259 tocode = LIBOFX_DEFAULT_OUTPUT_ENCODING;
260 message_out(DEBUG, "ofx_proc_file(): Setting up iconv for fromcode: " + fromcode + ", tocode: " + tocode);
261 conversion_descriptor = iconv_open (tocode.c_str(), fromcode.c_str());
262 used_iconv = true;
263#endif
264 }
265 }
266 else
267 {
268 //We are still in the headers
269 if ((header_separator_idx = s_buffer.find(':')) != string::npos)
270 {
271 //Header processing
272 header_name.assign(s_buffer.substr(0, header_separator_idx));
273 header_value.assign(s_buffer.substr(header_separator_idx + 1));
274 while ( header_value[header_value.length() - 1 ] == '\n' ||
275 header_value[header_value.length() - 1 ] == '\r' )
276 header_value.erase(header_value.length() - 1);
277 message_out(DEBUG, "ofx_proc_file():Header: " + header_name + " with value: " + header_value + " has been found");
278 if (header_name.compare("ENCODING") == 0)
279 {
280 ofx_encoding.assign(header_value);
281 }
282 if (header_name.compare("CHARSET") == 0)
283 {
284 ofx_charset.assign(header_value);
285 }
286 }
287 }
288 }
289
290 if (file_is_xml == true || (ofx_start == true && ofx_end == false))
291 {
292 if (ofx_start == true)
293 {
294 /* The above test won't help us if the <OFX> tag is on the same line
295 * as the xml header, but as opensp can't be used to parse it anyway
296 * this isn't a great loss for now.
297 */
298 s_buffer = sanitize_proprietary_tags(s_buffer);
299 if (s_buffer.empty())
300 continue;
301 }
302 //cout<< s_buffer<<"\n";
303 if (file_is_xml == false)
304 {
305#ifdef HAVE_ICONV
306 size_t inbytesleft = s_buffer.size();
307 size_t outbytesleft = inbytesleft * 2 - 1;
308 char * iconv_buffer = (char*) malloc (inbytesleft * 2);
309 memset(iconv_buffer, 0, inbytesleft * 2);
310 const char* inchar = s_buffer.c_str();
311 char * outchar = iconv_buffer;
312 int iconv_retval = iconv (conversion_descriptor,
313 const_cast<char**>(&inchar), &inbytesleft,
314 &outchar, &outbytesleft);
315 if (iconv_retval == -1)
316 {
317 message_out(ERROR, "ofx_proc_file(): Iconv conversion error");
318 }
319 // All validly converted bytes will be copied to the
320 // original buffer
321 s_buffer = std::string(iconv_buffer, outchar - iconv_buffer);
322 free (iconv_buffer);
323#endif
324 }
325 //cout << s_buffer << "\n";
326 tmp_file << s_buffer << endl;
327 }
328
329 if (ofx_start == true &&
330 (
331 (libofx_context->currentFileType() == OFX &&
332 ((ofx_start_idx = s_buffer.find("</OFX>")) != string::npos ||
333 (ofx_start_idx = s_buffer.find("</ofx>")) != string::npos))
334 || (libofx_context->currentFileType() == OFC &&
335 ((ofx_start_idx = s_buffer.find("</OFC>")) != string::npos ||
336 (ofx_start_idx = s_buffer.find("</ofc>")) != string::npos))
337 )
338 )
339 {
340 ofx_end = true;
341 message_out(DEBUG, "ofx_proc_file():</OFX> or </OFC> has been found");
342 }
343
344 }
345 while (!input_file.eof() && !input_file.bad());
346 }
347 input_file.close();
348 tmp_file.close();
349#ifdef HAVE_ICONV
350 if (used_iconv == true)
351 {
352 iconv_close(conversion_descriptor);
353 }
354#endif
355 char filename_openspdtd[255];
356 char filename_dtd[255];
357 char filename_ofx[255];
358 STRNCPY(filename_openspdtd, find_dtd(ctx, OPENSPDCL_FILENAME)); //The opensp sgml dtd file
359 if (libofx_context->currentFileType() == OFX)
360 {
361 STRNCPY(filename_dtd, find_dtd(ctx, OFX160DTD_FILENAME)); //The ofx dtd file
362 }
363 else if (libofx_context->currentFileType() == OFC)
364 {
365 STRNCPY(filename_dtd, find_dtd(ctx, OFCDTD_FILENAME)); //The ofc dtd file
366 }
367 else
368 {
369 message_out(ERROR, string("ofx_proc_file(): Error unknown file format for the OFX parser"));
370 }
371
372 if ((string)filename_dtd != "" && (string)filename_openspdtd != "")
373 {
374 strncpy(filename_ofx, tmp_filename, 255); //The processed ofx file
375 filenames[0] = filename_openspdtd;
376 filenames[1] = filename_dtd;
377 filenames[2] = filename_ofx;
378 int rv;
379 if (libofx_context->currentFileType() == OFX)
380 {
381 rv = ofx_proc_sgml(libofx_context, 3, filenames);
382 }
383 else if (libofx_context->currentFileType() == OFC)
384 {
385 rv = ofc_proc_sgml(libofx_context, 3, filenames);
386 }
387 else
388 {
389 message_out(ERROR, string("ofx_proc_file(): Error unknown file format for the OFX parser"));
390 rv = -1;
391 }
392 if (remove(tmp_filename) != 0)
393 {
394 message_out(ERROR, "ofx_proc_file(): Error deleting temporary file " + string(tmp_filename));
395 }
396 return rv;
397 }
398 else
399 {
400 message_out(ERROR, "ofx_proc_file(): FATAL: Missing DTD, aborting");
401 return -1;
402 }
403 }
404 else
405 {
406 message_out(ERROR, "ofx_proc_file():No input file specified");
407 return -1;
408 }
409 return 0;
410}
411
/* Looks for the next opening or closing tag in input_string, starting the
 * search at pos_start.
 *
 * On success the tag name (the text between '<' and '>', including a leading
 * '/' for closing tags) is returned, pos_start is set to the position of the
 * tag's '<' and pos_end to the position just past its '>'.
 * If the tag has no terminating '>', the remainder of the string after '<' is
 * returned as the name and pos_end is set to string::npos.
 * If there is no '<' at all, an empty string is returned and both positions
 * are set to string::npos.
 */
static std::string find_tag_open (std::string& input_string, size_t& pos_start, size_t& pos_end)
{
  const size_t lt_idx = input_string.find ('<', pos_start);
  pos_start = lt_idx;

  if (lt_idx == std::string::npos)
  {
    pos_end = std::string::npos;
    return std::string();
  }

  const size_t name_idx = lt_idx + 1;
  const size_t gt_idx = input_string.find ('>', name_idx);
  if (gt_idx == std::string::npos)
  {
    // Unterminated tag: everything following the '<' counts as the name.
    pos_end = std::string::npos;
    return input_string.substr(name_idx);
  }

  pos_end = gt_idx + 1;
  return input_string.substr(name_idx, gt_idx - name_idx);
}
433
434/* Searches input string for a closing tag matching tag_name starting at pos.
435 * If found pos will be set to the position right after of the closing '>'
436 * If no matching closing tag is found pos will be set to the start of the next
437 * opening or closing tag found.
438 */
439static void find_tag_close (string& input_string, string& tag_name, size_t& pos)
440{
441 size_t start_idx = input_string.find ("</" + tag_name + ">", pos);
442
443 if (start_idx == string::npos)
444 {
445 start_idx = pos;
446 size_t end_idx;
447 string new_tag_name = find_tag_open (input_string, start_idx, end_idx);
448 if (!new_tag_name.empty())
449 {
450 message_out(DEBUG, "find_tag_close() fell back to next open tag: " + new_tag_name);
451 // find_tag_open returns the *end* of an opening tag, but in this
452 // case we want its start, so we need to rewind a bit..
453 pos = start_idx;
454 //printf("find_tag_close() returning pos after fallback: %d\n",pos);
455 }
456 else
457 {
458 pos = input_string.length();
459 }
460 }
461 else
462 {
463 pos = start_idx + tag_name.length() + 3;
464 }
465 return;
466}
467
468
480string sanitize_proprietary_tags(string input_string)
481{
482 size_t last_known_good_pos = 0;
483 size_t open_tag_start_pos = last_known_good_pos;
484 size_t open_tag_end_pos;
485 size_t close_tag_end_pos;
486
487 string tag_name = find_tag_open(input_string, open_tag_start_pos, open_tag_end_pos);
488 while (!tag_name.empty())
489 {
490 // Determine whether the current tag is proprietary.
491 if ((tag_name.find('.') != string::npos) || // tag has a . in the name
492 (tag_name == "CATEGORY")) // Chase bank started setting these in 2017
493 {
494 close_tag_end_pos = open_tag_end_pos;
495 find_tag_close (input_string, tag_name, close_tag_end_pos);
496 size_t tag_size = close_tag_end_pos - open_tag_start_pos;
497 string prop_tag = input_string.substr(open_tag_start_pos, tag_size);
498 message_out(INFO, "sanitize_proprietary_tags() removed: " + prop_tag);
499 input_string.erase(open_tag_start_pos, tag_size);
500 last_known_good_pos = open_tag_start_pos;
501 }
502 else
503 {
504 last_known_good_pos = open_tag_end_pos;
505 }
506 tag_name.clear();
507 open_tag_start_pos = last_known_good_pos;
508 if (last_known_good_pos != string::npos)
509 tag_name = find_tag_open(input_string, open_tag_start_pos, open_tag_end_pos);
510 }
511 return input_string;
512}
513
514
515#ifdef __WIN32__
516static std::string get_dtd_installation_directory()
517{
518 // Partial implementation of
519 // http://developer.gnome.org/doc/API/2.0/glib/glib-Windows-Compatibility-Functions.html#g-win32-get-package-installation-directory
520 char ch_fn[MAX_PATH], *p;
521 std::string str_fn;
522
523 if (!GetModuleFileName(NULL, ch_fn, MAX_PATH)) return "";
524
525 if ((p = strrchr(ch_fn, '\\')) != NULL)
526 * p = '\0';
527
528 p = strrchr(ch_fn, '\\');
529 if (p && (_stricmp(p + 1, "bin") == 0 ||
530 _stricmp(p + 1, "lib") == 0))
531 *p = '\0';
532
533 str_fn = ch_fn;
534 str_fn += "\\share\\libofx\\dtd";
535
536 return str_fn;
537}
538#endif
539
540
553std::string find_dtd(LibofxContextPtr ctx, const std::string& dtd_filename)
554{
555 string dtd_path_filename;
556 char *env_dtd_path;
557
558 dtd_path_filename = reinterpret_cast<const LibofxContext*>(ctx)->dtdDir();
559 if (!dtd_path_filename.empty())
560 {
561 dtd_path_filename.append(dtd_filename);
562 ifstream dtd_file(dtd_path_filename.c_str());
563 if (dtd_file)
564 {
565 message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
566 return dtd_path_filename;
567 }
568 }
569
570#ifdef __WIN32__
571 dtd_path_filename = get_dtd_installation_directory();
572 if (!dtd_path_filename.empty())
573 {
574 dtd_path_filename.append(DIRSEP);
575 dtd_path_filename.append(dtd_filename);
576 ifstream dtd_file(dtd_path_filename.c_str());
577 if (dtd_file)
578 {
579 message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
580 return dtd_path_filename;
581 }
582 }
583#endif
584 /* Search in environment variable OFX_DTD_PATH */
585 env_dtd_path = getenv("OFX_DTD_PATH");
586 if (env_dtd_path)
587 {
588 dtd_path_filename.append(env_dtd_path);
589 dtd_path_filename.append(DIRSEP);
590 dtd_path_filename.append(dtd_filename);
591 ifstream dtd_file(dtd_path_filename.c_str());
592 if (!dtd_file)
593 {
594 message_out(STATUS, "find_dtd():OFX_DTD_PATH env variable was was present, but unable to open the file " + dtd_path_filename);
595 }
596 else
597 {
598 message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
599 return dtd_path_filename;
600 }
601 }
602
603 for (int i = 0; i < DTD_SEARCH_PATH_NUM; i++)
604 {
605 dtd_path_filename = DTD_SEARCH_PATH[i];
606 dtd_path_filename.append(DIRSEP);
607 dtd_path_filename.append(dtd_filename);
608 ifstream dtd_file(dtd_path_filename.c_str());
609 if (!dtd_file)
610 {
611 message_out(DEBUG, "find_dtd():Unable to open the file " + dtd_path_filename);
612 }
613 else
614 {
615 message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
616 return dtd_path_filename;
617 }
618 }
619
620 /* Last resort, look in source tree relative path (useful for development) */
621 dtd_path_filename = "";
622 dtd_path_filename.append("..");
623 dtd_path_filename.append(DIRSEP);
624 dtd_path_filename.append("dtd");
625 dtd_path_filename.append(DIRSEP);
626 dtd_path_filename.append(dtd_filename);
627 ifstream dtd_file(dtd_path_filename.c_str());
628 if (!dtd_file)
629 {
630 message_out(DEBUG, "find_dtd(): Unable to open the file " + dtd_path_filename + ", most likely we are not in the source tree.");
631 }
632 else
633 {
634 message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
635 return dtd_path_filename;
636 }
637
638
639 message_out(ERROR, "find_dtd():Unable to find the DTD named " + dtd_filename);
640 return "";
641}
Main header file containing the LibOfx API.
@ OFX
Definition libofx.h:129
@ OFC
Definition libofx.h:130
int message_out(OfxMsgType error_type, const string message)
Message output function.
Definition messages.cpp:61
Message IO functionality.
@ DEBUG
Definition messages.hh:25
@ ERROR
Definition messages.hh:34
@ INFO
Definition messages.hh:32
@ STATUS
Definition messages.hh:31
int ofc_proc_sgml(LibofxContext *libofx_context, int argc, char *const *argv)
Parses a DTD and OFX file(s)
Definition ofc_sgml.cpp:353
OFX/SGML parsing functionality.
const char * DTD_SEARCH_PATH[DTD_SEARCH_PATH_NUM]
The list of paths to search for the DTDs.
const int DTD_SEARCH_PATH_NUM
The number of different paths to search for DTDs.
string sanitize_proprietary_tags(string input_string)
Removes proprietary tags and comments.
std::string find_dtd(LibofxContextPtr ctx, const std::string &dtd_filename)
Find the appropriate DTD for the file version.
int ofx_proc_file(LibofxContextPtr ctx, const char *p_filename)
File pre-processing of OFX AND for OFC files.
Preprocessing of the OFX files before parsing.
int ofx_proc_sgml(LibofxContext *libofx_context, int argc, char *const *argv)
Parses a DTD and OFX file(s)
Definition ofx_sgml.cpp:434
OFX/SGML parsing functionality.
Various simple functions for type conversion & al.
void STRNCPY(T &dest, const std::string &src)