scantools  1.0.4
Graphics manipulation with a view towards scanned documents
HOCRDocument.h
1 /*
2  * Copyright © 2016-2018 Stefan Kebekus <stefan.kebekus@math.uni-freiburg.de>
3  *
4  * This program is free software: you can redistribute it and/or modify it under
5  * the terms of the GNU General Public License as published by the Free Software
6  * Foundation, either version 3 of the License, or (at your option) any later
7  * version.
8  *
9  * This program is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
12  * details.
13  *
14  * You should have received a copy of the GNU General Public License along with
15  * this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 
19 #ifndef HOCRDOCUMENT
20 #define HOCRDOCUMENT 1
21 
22 
23 #include "HOCRTextBox.h"
24 #include "resolution.h"
25 #include <QPageSize>
26 #include <QSet>
27 
28 
42 {
43  public:
46 
/**
 * Constructs an HOCR document from a QIODevice.
 *
 * The constructor does nothing but forward to read(QIODevice*).
 * NOTE(review): read() returns void, so failures are presumably reported
 * through hasError()/error() -- confirm against the implementation.
 *
 * @param device Device that is handed to read().
 */
54  explicit HOCRDocument(QIODevice *device) {read(device);};
55 
/**
 * Constructs an HOCR document from a file.
 *
 * The constructor does nothing but forward to read(const QString&). The file
 * name is taken by const reference, matching the signature of that read()
 * overload; callers that passed a QString by value keep working unchanged.
 * NOTE(review): read() returns void, so failures are presumably reported
 * through hasError()/error() -- confirm against the implementation.
 *
 * @param fileName Name of the file that is handed to read().
 */
63  explicit HOCRDocument(const QString &fileName) {read(fileName);}
64 
/**
 * Constructs an HOCR document by running the tesseract OCR engine.
 *
 * The constructor does nothing but forward to
 * read(const QImage&, const QStringList&). The language list is taken by
 * const reference, matching the signature of that read() overload; existing
 * callers keep working unchanged.
 *
 * NOTE(review): because 'languages' has a default value, this constructor can
 * be called with a single QImage and is not marked 'explicit', so a QImage
 * converts implicitly to an HOCRDocument. Consider adding 'explicit' once it
 * is confirmed that no caller relies on the implicit conversion.
 *
 * @param image Image that is handed to read().
 * @param languages Languages that are handed to read(); defaults to an empty
 * list.
 */
75  HOCRDocument(const QImage &image, const QStringList &languages=QStringList()) {read(image,languages);}
76 
/**
 * Resets the document.
 *
 * NOTE(review): the definition is not part of this header; presumably pages,
 * error message, warnings and OCR metadata are all cleared -- confirm in the
 * implementation.
 */
81  void clear();
82 
/**
 * Error status.
 *
 * @returns true if the stored error message is non-empty, false otherwise.
 */
88  bool hasError() const {return !_error.isEmpty();};
89 
/**
 * Error message.
 *
 * @returns A copy of the stored error message. The string is empty exactly
 * when hasError() returns false.
 */
96  QString error() const {return _error; };
97 
/**
 * Warning status.
 *
 * @returns true if at least one warning message is stored, false otherwise.
 */
105  bool hasWarnings() const {return !_warnings.isEmpty();};
106 
/**
 * Warning messages.
 *
 * @returns A copy of the set of stored warning messages. The set is empty
 * exactly when hasWarnings() returns false.
 */
111  QSet<QString> warnings() const {return _warnings;};
112 
/**
 * System(s) that generated this file.
 *
 * @returns A copy of the set of OCR system names stored in this document.
 * According to the comment on the backing member, these are taken from a meta
 * tag of the HOCR file.
 */
119  QSet<QString> system() const {return _OCRSystem;};
120 
/**
 * OCR capabilities.
 *
 * @returns A copy of the set of OCR capabilities stored in this document.
 * According to the comment on the backing member, these are taken from a meta
 * tag of the HOCR file.
 */
129  QSet<QString> capabilities() const {return _OCRCapabilities;};
130 
/**
 * Pages in the document.
 *
 * @returns A copy of the list of pages; every page is an HOCRTextBox.
 */
136  QList<HOCRTextBox> pages() const {return _pages;};
137 
/**
 * Returns true if the document contains no pages.
 *
 * @returns true if the list of pages is empty, false otherwise.
 */
142  bool isEmpty() const {return _pages.isEmpty();};
143 
144 
/**
 * Check if the document does contain text.
 *
 * NOTE(review): the definition is not part of this header. Presumably a
 * document can contain pages and still have no text, which would make this
 * different from !isEmpty() -- confirm in the implementation.
 *
 * @returns true if the document contains text.
 */
152  bool hasText() const;
153 
// Body of takeFirstPage(); its signature (header line 162) is missing from
// this listing. Removes the first page of the document and returns it. If the
// document has no pages, a default-constructed HOCRTextBox is returned
// instead.
163  if (_pages.size() > 0)
164  return _pages.takeFirst();
165  else
166  return HOCRTextBox();
167  };
168 
/**
 * Reads an HOCR document from a QIODevice.
 *
 * NOTE(review): the method returns void, so failures are presumably reported
 * through hasError()/error() -- confirm against the implementation.
 *
 * @param device Device to read the HOCR data from.
 */
182  void read(QIODevice *device);
183 
/**
 * Reads an HOCR document from a file.
 *
 * NOTE(review): the method returns void, so failures are presumably reported
 * through hasError()/error() -- confirm against the implementation.
 *
 * @param fileName Name of the file to read the HOCR data from.
 */
191  void read(const QString& fileName);
192 
/**
 * Generates an HOCR document by running the tesseract OCR engine.
 *
 * NOTE(review): presumably every entry of 'languages' has to be one of the
 * languages returned by tesseractLanguages() -- confirm against the
 * implementation.
 *
 * @param image Image on which the OCR engine is run.
 * @param languages Languages passed to the OCR run; defaults to an empty
 * list.
 */
210  void read(const QImage &image, const QStringList& languages=QStringList());
211 
/**
 * Suggest font.
 *
 * NOTE(review): the criteria used to pick the font are not visible in this
 * header.
 *
 * @returns The suggested font.
 */
222  QFont suggestFont() const;
223 
/**
 * Export to PDF.
 *
 * @param fileName Name of the PDF file.
 * @param _resolution Resolution used for the export; together with
 * overridePageSize it enters the page size computation done by
 * findPageSize().
 * @param title Title; defaults to an empty string. NOTE(review): presumably
 * stored as the title of the PDF document -- confirm.
 * @param overridePageSize Page size override; defaults to a
 * default-constructed QPageSize.
 * @param overrideFont Optional font override; defaults to a null pointer.
 *
 * @returns A QString. NOTE(review): presumably an error message that is empty
 * on success -- confirm against the implementation.
 */
248  QString toPDF(const QString& fileName, resolution _resolution, const QString& title=QString(), const QPageSize& overridePageSize=QPageSize(), QFont *overrideFont=0) const;
249 
/**
 * Export to images.
 *
 * @param overrideFont Optional font override; defaults to a null pointer.
 * @param format Image format of the result; defaults to
 * QImage::Format_Grayscale8.
 *
 * @returns A list of images. NOTE(review): presumably one image per page --
 * confirm against the implementation.
 */
264  QList<QImage> toImages(QFont *overrideFont=0, QImage::Format format=QImage::Format_Grayscale8) const;
265 
/**
 * Export this document as text.
 *
 * @returns The text of the document as a QString.
 */
272  QString toText() const;
273 
/**
 * Appends other HOCRDocument.
 *
 * NOTE(review): the definition is not part of this header; presumably the
 * pages of 'other' are added after the pages of this document, and warnings
 * and OCR metadata are merged -- confirm in the implementation.
 *
 * @param other Document that is appended to this one.
 */
281  void append(const HOCRDocument &other);
282 
/**
 * List of languages supported by tesseract.
 *
 * @returns The supported languages as a list of strings.
 */
289  static QStringList tesseractLanguages();
290 
/**
 * Check if languages are supported by tesseract.
 *
 * @param lingos Languages to check.
 *
 * @returns true if the languages are supported. NOTE(review): presumably this
 * requires every entry of 'lingos' to be supported -- confirm against the
 * implementation.
 */
300  static bool areLanguagesSupportedByTesseract(const QStringList& lingos);
301 
302  private:
303  // This is a convenience method that suggests a page size for a given page of
304  // the document, taking resolution and overridePageSize into account. The
305  // reason for the existence of this method is that the computation is needed
306  // in two different places in the PDF export method (toPDF), and I wanted to
307  // avoid duplicated code.
308  QPageSize findPageSize(int pageNumber, resolution _resolution, const QPageSize &overridePageSize) const;
309 
310  // Error message; an empty string means "no error" (this is what hasError() tests)
311  QString _error;
312 
313  // System(s) that generated this file, as specified in a meta tag of the HOCR
314  // file; returned by system()
315  QSet<QString> _OCRSystem;
316 
317  // OCR capabilities used in this file, as specified in a meta tag of the HOCR
318  // file; returned by capabilities()
319  QSet<QString> _OCRCapabilities;
320 
321  // Pages of the document, one HOCRTextBox per page; returned by pages()
322  QList<HOCRTextBox> _pages;
323 
324  // Warning messages; an empty set means "no warnings" (this is what hasWarnings() tests)
325  QSet<QString> _warnings;
326 };
327 
328 #endif
Reads and interprets HOCR files, the standard output file format for Optical Character Recognition systems.
Definition: HOCRDocument.h:42
HOCRDocument(QString fileName)
Constructs an HOCR document from a file.
Definition: HOCRDocument.h:63
HOCRDocument(QIODevice *device)
Constructs an HOCR document from a QIODevice.
Definition: HOCRDocument.h:54
QString toText() const
Export this document as text.
static bool areLanguagesSupportedByTesseract(const QStringList &lingos)
Check if languages are supported by tesseract.
QList< QImage > toImages(QFont *overrideFont=0, QImage::Format format=QImage::Format_Grayscale8) const
Export to images.
void read(const QImage &image, const QStringList &languages=QStringList())
Generates an HOCR document by running the tesseract OCR engine.
void read(QIODevice *device)
Reads an HOCR document from a QIODevice.
bool isEmpty() const
Returns true if the document contains no pages.
Definition: HOCRDocument.h:142
QString error() const
Error message.
Definition: HOCRDocument.h:96
HOCRDocument()
Constructs an empty HOCR document.
Definition: HOCRDocument.h:45
HOCRTextBox takeFirstPage()
Removes the first page of the document and returns it.
Definition: HOCRDocument.h:162
void read(const QString &fileName)
Reads an HOCR document from a file.
QFont suggestFont() const
Suggest font.
QSet< QString > system() const
System(s) that generated this file.
Definition: HOCRDocument.h:119
QSet< QString > warnings() const
Warning messages.
Definition: HOCRDocument.h:111
bool hasText() const
Check if the document does contain text.
HOCRDocument(const QImage &image, QStringList languages=QStringList())
Constructs an HOCR document by running the tesseract OCR engine.
Definition: HOCRDocument.h:75
void append(const HOCRDocument &other)
Appends other HOCRDocument.
static QStringList tesseractLanguages()
List of languages supported by tesseract.
bool hasWarnings() const
Warning status.
Definition: HOCRDocument.h:105
QList< HOCRTextBox > pages() const
Pages in the document.
Definition: HOCRDocument.h:136
bool hasError() const
Error status.
Definition: HOCRDocument.h:88
QString toPDF(const QString &fileName, resolution _resolution, const QString &title=QString(), const QPageSize &overridePageSize=QPageSize(), QFont *overrideFont=0) const
Export to PDF.
void clear()
Resets the document.
QSet< QString > capabilities() const
OCR capabilities.
Definition: HOCRDocument.h:129
Text box, as defined in an HOCR file.
Definition: HOCRTextBox.h:45
The resolution class stores a resolution and converts between units.
Definition: resolution.h:40