scantools  1.0.4
Graphics manipulation with a view towards scanned documents
HOCRTextBox.h
1 /*
2  * Copyright © 2016--2018 Stefan Kebekus <stefan.kebekus@math.uni-freiburg.de>
3  *
4  * This program is free software: you can redistribute it and/or modify it under
5  * the terms of the GNU General Public License as published by the Free Software
6  * Foundation, either version 3 of the License, or (at your option) any later
7  * version.
8  *
9  * This program is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
12  * details.
13  *
14  * You should have received a copy of the GNU General Public License along with
15  * this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 
19 #ifndef HOCRTEXTBOX
20 #define HOCRTEXTBOX 1
21 
22 #include <QPainter>
23 #include <QSet>
24 #include <QXmlStreamReader>
25 
26 #include "resolution.h"
27 
28 class HOCRDocument;
29 
30 
45 {
46  public:
49 
55  bool hasText() const; // Decides if the text box contains non-trivial text.
56 
62  qreal angle() const { return _angle; } // Text angle: returns a copy of _angle (zero if no angle was specified).
63 
69  QXmlStreamAttributes attributes() const { return _attributes; } // Returns a copy of the attributes of the text box, as read from the HOCR file.
70 
77  QVector<qreal> baselinePolynomial() const { return _baselinePolynomial; } // Base line as a polynomial; the vector is empty if no base line was specified.
78 
85  QPoint baselineReferencePoint() const { return _baselineReferencePoint; } // Base line reference point; meaningless if no base line polynomial is specified.
86 
92  QRect boundingBox() const { return _boundingBox; } // Bounding box; empty if the HOCR file specified none.
93 
101  QString classType() const; // Class of this text box.
102 
108  int confidence() const { return _confidence; } // Confidence level; '-1' if the HOCR file specified none.
109 
116  QString direction() const {return _direction;};
117 
123  qreal fontSize() const { return _fontSize; } // Font size; '0.0' if the HOCR file specified none.
124 
131  QString imageName() const { return _imageName; } // Name of the image associated with the content of this text box; empty if none was specified.
132 
138  QString language() const { return _language; } // Language of the content of this text box; empty if none was specified.
139 
150  void render(QPainter &painter) const; // Paints the contents of the text box to the given painter.
151 
164  QImage toImage(const QFont &overrideFont, QImage::Format format=QImage::Format_Grayscale8) const; // Exports this text box as an image; 'format' defaults to 8-bit grayscale.
165 
191  QByteArray toRawPDFContentStream(const QFont &font, resolution xRes, resolution yRes, length deltaX=length(), length deltaY=length()) const; // Returns raw PDF text rendering commands; deltaX and deltaY default to a default-constructed length.
192 
198  QString toText() const; // Exports this text box as text.
199 
213  qint64 estimateFit(const QFont &font) const; // Estimates how well the given font fits the text box.
214 
225  QFont suggestFont() const; // Suggests a font for this text box.
226 
232  QString text() const { return _text; } // Text content of the text box; empty if the HOCR file specified none.
233 
234  private:
235  // Specifies how and where the textbox' content should be drawn.
236  struct renderingHints {
238  int fontSize; // Font size to use when drawing the text.
239 
241  QPoint referencePoint; // Point from which the text of the text box is drawn.
242 
244  qreal horizontalStretchFactor; // Horizontal stretch factor; presumably applied so the text fits the bounding box (TODO confirm).
245  };
246 
247  // For a given font, this method computes rendering hints so that the text
248  // fits the bounding box best.
249  //
250  // - If the HOCR file specifies a font size, this size is taken. Otherwise,
251  //   the font metric is used to find a size that best fits the bounding
252  //   box. As a last resort, a standard value is returned.
253  //
254  // - The font metric is used to suggest a point from which to draw the text of
255  //   this text box, and a horizontal stretch factor.
256  renderingHints getRenderingHints(const QFont &font) const;
257 
258  // This is to ensure that text boxes can be constructed from HOCR documents.
259  friend HOCRDocument;
260 
261  // Constructs a textbox by reading from an XML stream. This constructor expects
262  // that 'xml' points to a start element. It reads the stream until it reaches
263  // the corresponding end element; when it returns, 'xml' points to this end
264  // element. In case of problems, warnings are added to 'warnings'. 'parent'
265  // defaults to a null pointer; presumably it is the enclosing box from which values are inherited (TODO confirm).
266  HOCRTextBox(QXmlStreamReader &xml, QSet<QString> &warnings, HOCRTextBox *parent=nullptr);
267 
268  // Interprets _attributes and fills in the members extracted from them: the
269  // base line members, _boundingBox, _class, _confidence, _fontSize and
270  // _imageName. Problems encountered in the interpretation are added to the set
271  // 'warnings'. This method is called only in the constructor. The code is not
272  // part of the constructor to keep the source readable. 'line' and 'column' presumably locate the element in the HOCR file (TODO confirm).
273  void interpretAttributes(QSet<QString> &warnings, qint64 line, qint64 column);
274 
275  // Attributes, as read from the HOCR file
276  QXmlStreamAttributes _attributes;
277 
278  // List of sub boxes, as read from the HOCR file
279  QList<HOCRTextBox> _subBoxes;
280 
281  /*
282  * Attributes extracted from the HOCR file
283  */
284 
285  // Textangle, as specified in the HOCR file or inherited from parent. If no
286  // angle was specified, this number is zero.
287  qreal _angle;
288 
289  // Base line as a polynomial, as specified in the HOCR file or inherited from
290  // parent. If no base line was specified, this vector is empty.
291  QVector<qreal> _baselinePolynomial;
292 
293  // Base line reference point, as specified in the HOCR file or inherited from
294  // parent. If no base line polynomial is specified, this member is
295  // meaningless.
296  QPoint _baselineReferencePoint;
297 
298  // Bounding box, as specified in the HOCR file. If no bounding box was
299  // specified, this box is empty.
300  QRect _boundingBox;
301 
302  // Contains the class of the corresponding element in the HOCR file. Typical
303  // values are "ocr_page", "ocr_carea", "ocr_par", "ocr_line" or "ocrx_word".
304  QString _class;
305 
306  // Contains the confidence level of the corresponding element in the HOCR
307  // file. If no confidence level is specified, this member contains '-1'.
308  int _confidence;
309 
310  // Contains the text flow direction of the corresponding element in the HOCR
311  // file. The value 'ltr' means left-to-right, 'rtl' means right-to-left. Any
312  // other value means 'undefined'.
313  QString _direction;
314 
315  // Contains the font size specified in the corresponding element in the HOCR
316  // file. If no font size is specified, this member contains '0.0'.
317  qreal _fontSize;
318 
319  // Contains the name of an image associated with the content of this text
320  // box. If nothing is specified in the HOCR file, this string is empty.
321  QString _imageName;
322 
323  // Language of the content of this text box. If nothing is specified in the
324  // HOCR file, this string is empty.
325  QString _language;
326 
327  // Contains the text of this text box. If nothing is specified in the HOCR
328  // file, this string is empty.
329  QString _text;
330 
331  /*
332  * Helper functions
333  */
334 
335  // Expects a string of the form "blabla int int int …", i.e. some leading
336  // text followed by integers, and returns a vector containing the integers.
337  QVector<int> getIntegers(const QString& spec) const;
338 
339  // Expects a string of the form "blabla qreal qreal qreal …", i.e. some leading
340  // text followed by floating point numbers, and returns a vector containing the qreals.
341  QVector<qreal> getFloats(const QString& spec) const;
342 
343  // Trivial method that writes out a floating point number in ASCII, with up to
344  // four decimal places of precision. Trailing zeroes are deleted for brevity's
345  // sake. It seems that Qt cannot do that.
346  QByteArray toNumber(qreal x) const;
347 
348  // Internal method that actually does the work for the user method with the
349  // same name. This method is applied recursively over all sub-boxes, and the
350  // results are joined. It differs from the user method in that it takes three
351  // additional arguments: 'height' is the height of the bounding box for which
352  // the user method was called; this is necessary for correct text
353  // placement. The argument 'currentFontSize' is the font size last set; this is
354  // presumably used to avoid setting the same size time and again. The parameter
355  // 'codec' is a pointer to the "Windows-1252" QTextCodec.
356  QByteArray toRawPDFContentStream(const QFont &font, resolution xRes, resolution yRes, length deltaX, length deltaY, quint16 height, qreal &currentFontSize, QTextCodec *codec) const;
357 };
358 
359 
360 #endif
Reads and interprets HOCR files, the standard output file format for Optical Character Recognition sy...
Definition: HOCRDocument.h:42
Text box, as defined in an HOCR file.
Definition: HOCRTextBox.h:45
HOCRTextBox()
Constructs an empty text box.
void render(QPainter &painter) const
Paint the contents of the text box to a painter.
qint64 estimateFit(const QFont &font) const
Estimate how well a given font fits the textbox.
QString classType() const
Class of this textBox.
QByteArray toRawPDFContentStream(const QFont &font, resolution xRes, resolution yRes, length deltaX=length(), length deltaY=length()) const
Return raw PDF text rendering commands.
QString toText() const
Export this text box as text.
qreal angle() const
Text angle.
Definition: HOCRTextBox.h:62
qreal fontSize() const
Font size.
Definition: HOCRTextBox.h:123
QVector< qreal > baselinePolynomial() const
Base line as a polynomial.
Definition: HOCRTextBox.h:77
QPoint baselineReferencePoint() const
Base line reference point.
Definition: HOCRTextBox.h:85
QRect boundingBox() const
Bounding box.
Definition: HOCRTextBox.h:92
QString text() const
Text content of the text box.
Definition: HOCRTextBox.h:232
bool hasText() const
Decide if the text box contains non-trivial text.
QString imageName() const
Image associated with content of this text box.
Definition: HOCRTextBox.h:131
int confidence() const
Confidence level.
Definition: HOCRTextBox.h:108
QXmlStreamAttributes attributes() const
Returns the attributes of the textBox.
Definition: HOCRTextBox.h:69
QFont suggestFont() const
Suggest font.
QImage toImage(const QFont &overrideFont, QImage::Format format=QImage::Format_Grayscale8) const
Export this text box as an image.
QString language() const
Language of the content of this text box.
Definition: HOCRTextBox.h:138
QString direction() const
Text flow direction.
Definition: HOCRTextBox.h:116
The length stores a length and converts between units.
Definition: length.h:38
The resolution class stores a resolution and converts between units.
Definition: resolution.h:40