scantools  1.0.4
Graphics manipulation with a view towards scanned documents
PDFAWriter.h
1 /*
2  * Copyright © 2016 - 2020 Stefan Kebekus <stefan.kebekus@math.uni-freiburg.de>
3  *
4  * This program is free software: you can redistribute it and/or modify it under
5  * the terms of the GNU General Public License as published by the Free Software
6  * Foundation, either version 3 of the License, or (at your option) any later
7  * version.
8  *
9  * This program is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
12  * details.
13  *
14  * You should have received a copy of the GNU General Public License along with
15  * this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 
19 #ifndef PDFDOCUMENT
20 #define PDFDOCUMENT 1
21 
22 #include <QFuture>
23 #include <QList>
24 #include <QReadWriteLock>
25 #include <QString>
26 
27 #include "HOCRDocument.h"
28 #include "JBIG2Document.h"
29 #include "paperSize.h"
30 #include "resolution.h"
31 
32 
127 class PDFAWriter : public QObject
128 {
129  Q_OBJECT
130  Q_PROPERTY(QString author READ author WRITE setAuthor NOTIFY authorChanged)
131  Q_PROPERTY(QString keywords READ keywords WRITE setKeywords NOTIFY keywordsChanged)
132  Q_PROPERTY(QString subject READ subject WRITE setSubject NOTIFY subjectChanged)
133  Q_PROPERTY(QString title READ title WRITE setTitle NOTIFY titleChanged)
134  Q_PROPERTY(paperSize pageSize READ pageSize WRITE setPageSize NOTIFY pageSizeChanged)
135  Q_PROPERTY(resolution resolutionOverrideHorizontal READ resolutionOverrideHorizontal WRITE setResolutionOverrideHorizontal NOTIFY resolutionOverrideHorizontalChanged)
136  Q_PROPERTY(resolution resolutionOverrideVertical READ resolutionOverrideVertical WRITE setResolutionOverrideVertical NOTIFY resolutionOverrideVerticalChanged)
137  Q_PROPERTY(bool autoOCR READ autoOCR WRITE setAutoOCR NOTIFY autoOCRChanged)
138  Q_PROPERTY(QStringList autoOCRLanguages READ autoOCRLanguages WRITE setAutoOCRLanguages NOTIFY autoOCRLanguagesChanged)
139 
140  public:
147 
168  explicit PDFAWriter(bool bestCompression=false);
169 
173  QString author();
174 
179  void setAuthor(const QString &author);
180 
184  QString keywords();
185 
190  void setKeywords(const QString &keywords);
191 
196  QString subject();
197 
202  void setSubject(const QString &subject);
203 
207  QString title();
208 
213  void setTitle(const QString &title);
214 
220 
225  void setPageSize(const paperSize &size);
226 
232 
238 
246 
252 
260 
274  void setResolutionOverride(resolution horizontal, resolution vertical);
275 
281  {
282  setResolutionOverride(res, res);
283  }
284 
287  {
289  }
290 
294  bool autoOCR();
295 
305  void setAutoOCR(bool autoOCR);
306 
311  QStringList autoOCRLanguages();
312 
330  QString setAutoOCRLanguages(const QStringList &OCRLanguages);
331 
347  void appendToOCRData(const HOCRDocument &doc);
348 
356 
361  void clearOCRData();
362 
396  QString addPages(const QImage &image, QStringList *warnings=0);
397 
417  QString addPages(const JBIG2Document &jbig2doc, QStringList *warnings=0);
418 
487  QString addPages(const QString &imageFileName, QStringList *warnings=0);
488 
500  operator QByteArray();
501 
502  public slots:
515 
516  signals:
519 
522 
525 
527  void titleChanged();
528 
531 
534 
537 
540 
543 
551  void finished();
552 
565  void progress(qreal percentage);
566 
567  private:
568  // Meta data
569  QString _author, _keywords, _subject, _title;
570 
571  // Paper size
572  paperSize _pageSize;
573 
574  // HOCR Document
575  HOCRDocument userSpecifiedOCRData;
576  QStringList OCRLanguages;
577  bool _autoOCR;
578 
579  // Override resolutions
580  resolution horizontalResolutionOverride;
581  resolution verticalResolutionOverride;
582 
583  // This private method adds a JBIG2 image to the PDF document. It differs from
584  // the generic method addPages() only in its argument: it expects the name of
585  // a JBIG2 file instead of an arbitrary graphics file.
586  //
587  // The image will be embedded in the PDF without re-encoding. The method does
588  // not check in detail if the file complies with the JBIG2 standard. If
589  // invalid input data is fed into this method, then the resulting PDF file
590  // might not comply with the PDF/A standard.
591  QString addJBIG2(const QString &fileName, QStringList *warnings=0);
592 
593  // This private method adds a JPEG image to the PDF document. It differs from
594  // the generic method addPages() only in its argument: it expects the name of
595  // a JPEG file instead of an arbitrary graphics file.
596  //
597  // The image will be embedded in the PDF without re-encoding. The method does
598  // not check in detail if the file complies with the JPEG standard. If
599  // invalid input data is fed into this method, then the resulting PDF file
600  // might not comply with the PDF/A standard.
601  QString addJPEG(const QString &fileName);
602 
603  // This private method adds a JPEG2000 (ISO/IEC 15444-2) image to the PDF
604  // document. The method expects a JPX or JPF file, and NOT a JP2 file. It
605  // differs from the generic method addPages() only in its argument: it
606  // expects the name of a JPEG2000 file instead of an arbitrary graphics file.
607  //
608  // The image will be embedded in the PDF without re-encoding. The method does
609  // not check in detail if the file complies with the JPEG2000 standard. If
610  // invalid input data is fed into this method, then the resulting PDF file
611  // might not comply with the PDF/A standard.
612  QString addJPX(const QString &fileName);
613 
614  // This private method adds a TIFF image to the PDF document. The method
615  // exists because QImageReader cannot handle multi-page TIFF files. The method
616  // reads all images contained in the file, and calls addImage() to add them to
617  // the PDF
618  QString addTIFF(const QString &fileName);
619 
620  // This private method is used internally to generate a page containing a
621  // given graphicObject, and optionally a text overlay. This method assumes
622  // that the arguments have been checked and are correct. It also assumes that
623  // the PDFAWriter has been locked for writing.
624  void addGFXPage(quint32 graphicObjectIndex, const imageInfo& bInfo, const QImage& imageForOCR = QImage());
625 
626  // Lock used to provide thread-safety
627  QReadWriteLock lock;
628 
629  // PDF protoObject. Holds one PDF object either as ready-made data (QByteArray) or as a pending computation (QFuture<QByteArray>).
630  class protoObject {
631  public:
632  // cppcheck-suppress noExplicitConstructor
633  protoObject(QByteArray _data) : data(_data) { // deliberately non-explicit (hence the suppression above); 'future' is left default-constructed
634  ;
635  };
636 
637  // cppcheck-suppress noExplicitConstructor
638  protoObject(QFuture<QByteArray> _future) : future(_future) { // deliberately non-explicit (hence the suppression above); 'data' is left empty
639  ;
640  };
641 
642  inline operator QByteArray() { // Returns the object's bytes; non-const because it may move the future's result into 'data'
643  if (!future.isCanceled()) { // NOTE(review): assumes a default-constructed QFuture reports isCanceled() == true, so this branch is skipped when only 'data' was supplied; confirm for the Qt version in use
644  data = future.result(); // take the result from the future; presumably waits until it is available (see Qt docs for QFuture::result)
645  future = QFuture<QByteArray>(); // replace by a default-constructed future; under the assumption above, later conversions return 'data' directly
646  }
647  return data;
648  };
649 
650  QString description; // not initialised by either constructor; purpose not visible in this listing
651  QByteArray data; // bytes of the PDF object, once available
652  QFuture<QByteArray> future; // pending computation of the bytes, if any
653  };
654 
655  // List of PDF objects
656  QList<protoObject> objects;
657 
658  // Index of the PDF object in the 'objects' list that contains …
659  quint32 catalogObjectIndex; // … the catalog of the PDF file
660  quint32 metaDataObjectIndex; // … the meta data
661  quint32 infoObjectIndex; // … the info object
662  quint32 pageDirectoryObjectIndex; // … the page directory
663  quint32 colorProfileObjectIndex; // … the color profile
664  quint32 fontObjectIndex; // … the font object itself
665 
666  // Use zopfli compression for bitmap graphics
667  bool bestCompression;
668 
669  // Indices of the PDF page objects in the 'objects' list
670  QList<quint32> pageIndices;
671 
672  // Reads file content into QByteArray
673  static QByteArray readFile(const QString& fileName);
674 
675  // Constructs a page directory object
676  QByteArray generatePageDirectoryObject() const;
677 
678  // Takes data from input, checks if zlib compression actually shrinks the
679  // data, and then generates a stream object, either unencoded or zlib encoded.
680  static QByteArray generateStreamObject(const QByteArray &input);
681 
682  // Returns the index of a font object for Times-Roman. Creates the object, if necessary
683  quint32 getFontObjectIndex();
684 
685  // Assumes that the image is black-and-white, as returned by
686  // imageOperations::optimizedFormat(), and returns a QByteArray containing a
687  // PDF object containing the FAX G4 compressed image.
688  static QByteArray createImageObject_bw_G4(const QImage &image);
689 
690  // Assumes that the image is bitonal, as returned by
691  // imageOperations::optimizedFormat(), and returns a QByteArray containing a
692  // PDF object containing the FAX G4 compressed image.
693  static QByteArray createImageObject_bitonal_G4(const QImage &image);
694 
695  // Assumes that the image is grayscale, as returned by
696  // imageOperations::optimizedFormat(), and returns a QByteArray containing a
697  // PDF object containing the zlib/zopfli compressed image.
698  static QByteArray createImageObject_gray_zlib(const QImage &image, bool bestCompression);
699 
700  // Assumes that the image has an indexed palette, as returned by
701  // imageOperations::optimizedFormat(), and returns a QByteArray containing a
702  // PDF object containing the zlib/zopfli compressed image.
703  static QByteArray createImageObject_indexed_zlib(const QImage &image, bool bestCompression);
704 
705  // Assumes that the image is full color, as returned by
706  // imageOperations::optimizedFormat(), and returns a QByteArray containing a
707  // PDF object containing the zlib/zopfli compressed image.
708  static QByteArray createImageObject_rgb_zlib(const QImage &image, bool bestCompression);
709 
710  // Internal method. The method takes a page content stream and generates a
711  // well-compressed pageContent object, using the textBox to create a text
712  // overlay.
713  static QByteArray completePageContentObject_a(QByteArray contentStream, const imageInfo& bInfo, length deltaX, length deltaY, const HOCRTextBox& textBox);
714 
715  // Internal method. The method runs the tesseract OCR engine to create a
716  // HOCRTextBox and then calls completePageContentObject_a.
717  static QByteArray completePageContentObject_b(QByteArray contentStream, const imageInfo& bInfo, length deltaX, length deltaY, const QImage& image, const QStringList& OCRLanguages);
718 };
719 
720 #endif
Reads and interprets HOCR files, the standard output file format for Optical Character Recognition sy...
Definition: HOCRDocument.h:42
Text box, as defined in an HOCR file.
Definition: HOCRTextBox.h:45
Reads, writes and renders JBIG2 files, and chops them into pieces for inclusion into a PDF document.
Definition: JBIG2Document.h:40
Simple generator for PDF/A-2b compliant documents.
Definition: PDFAWriter.h:128
void setPageSize(const paperSize &size)
Sets page size, effective for future calls of the methods addPages()
void setResolutionOverride(resolution horizontal, resolution vertical)
Sets graphic resolution for future calls of the methods addPages()
void setSubject(const QString &subject)
Set the subject string in the PDF/A meta data.
void setResolutionOverrideVertical(resolution vertical)
Set vertical resolution.
void subjectChanged()
Emitted when subject changes.
QStringList autoOCRLanguages()
List of languages used for OCR.
void waitForWorkerThreads()
Waits for all worker threads to finish.
void setResolutionOverrideHorizontal(resolution horizontal)
Set horizontal resolution.
void pageSizeChanged()
Emitted when pageSize changes.
void resolutionOverrideVerticalChanged()
Emitted when resolutionOverrideVertical changes.
void progress(qreal percentage)
Progress indicator.
QString setAutoOCRLanguages(const QStringList &OCRLanguages)
Specify languages used by the tesseract OCR engine.
void setAutoOCR(bool autoOCR)
Specify if the tesseract OCR engine should be run automatically.
void clearOCRData()
Delete all pages from the internal HOCRDocument.
void resolutionOverrideHorizontalChanged()
Emitted when resolutionOverrideHorizontal changes.
~PDFAWriter()
Destructor.
void autoOCRLanguagesChanged()
Emitted when autoOCRLanguages change.
void setResolutionOverride(resolution res)
Overloaded method that sets horizontal and vertical resolution to the same value.
Definition: PDFAWriter.h:280
paperSize pageSize()
Page Size.
QString title()
Metadata: Title String.
void titleChanged()
Emitted when title changes.
void setPageSize(paperSize::format size=paperSize::empty)
Sets page size, effective for future calls of the methods addPages()
resolution resolutionOverrideHorizontal()
Horizontal resolution.
QString addPages(const QString &imageFileName, QStringList *warnings=0)
Add images to the PDF document.
void setTitle(const QString &title)
Set the title string in the PDF/A meta data.
void finished()
Emitted just before waitForWorkerThreads() returns.
QString addPages(const JBIG2Document &jbig2doc, QStringList *warnings=0)
Add JBIG2 images to the PDF document.
QString keywords()
Metadata: Keywords.
bool autoOCR()
AutoOCR.
void authorChanged()
Emitted when author changes.
void setAuthor(const QString &author)
Set the author string in the PDF/A meta data.
HOCRDocument OCRData()
Return a copy of the internal HOCRDocument.
PDFAWriter(bool bestCompression=false)
Constructor.
QString addPages(const QImage &image, QStringList *warnings=0)
Add an image to the PDF document.
void autoOCRChanged()
Emitted when autoOCR changes.
QString author()
Metadata: Author.
void keywordsChanged()
Emitted when keywords change.
QString subject()
Metadata: Subject string.
void clearResolutionOverride()
Set horizontal and vertical override resolution to zero.
Definition: PDFAWriter.h:286
resolution resolutionOverrideVertical()
Vertical resolution.
void setKeywords(const QString &keywords)
Set the keywords string in the PDF/A meta data.
void appendToOCRData(const HOCRDocument &doc)
Specify pre-processed OCR data.
Trivial class to store elementary info about bitmap graphics.
Definition: imageInfo.h:31
The length stores a length and converts between units.
Definition: length.h:38
The paperSize class identifies and stores paper sizes.
Definition: paperSize.h:32
format
List of supported standard sizes.
Definition: paperSize.h:35
@ empty
0x0mm
Definition: paperSize.h:38
The resolution class stores a resolution and converts between units.
Definition: resolution.h:40