Leptonica  1.82.0
Image processing and image analysis suite
pdfio1.c File Reference
#include <string.h>
#include <math.h>
#include "allheaders.h"

Go to the source code of this file.

Functions

l_ok convertFilesToPdf (const char *dirname, const char *substr, l_int32 res, l_float32 scalefactor, l_int32 type, l_int32 quality, const char *title, const char *fileout)
 
l_ok saConvertFilesToPdf (SARRAY *sa, l_int32 res, l_float32 scalefactor, l_int32 type, l_int32 quality, const char *title, const char *fileout)
 
l_ok saConvertFilesToPdfData (SARRAY *sa, l_int32 res, l_float32 scalefactor, l_int32 type, l_int32 quality, const char *title, l_uint8 **pdata, size_t *pnbytes)
 
l_ok selectDefaultPdfEncoding (PIX *pix, l_int32 *ptype)
 
l_ok convertUnscaledFilesToPdf (const char *dirname, const char *substr, const char *title, const char *fileout)
 
l_ok saConvertUnscaledFilesToPdf (SARRAY *sa, const char *title, const char *fileout)
 
l_ok saConvertUnscaledFilesToPdfData (SARRAY *sa, const char *title, l_uint8 **pdata, size_t *pnbytes)
 
l_ok convertUnscaledToPdfData (const char *fname, const char *title, l_uint8 **pdata, size_t *pnbytes)
 
l_ok pixaConvertToPdf (PIXA *pixa, l_int32 res, l_float32 scalefactor, l_int32 type, l_int32 quality, const char *title, const char *fileout)
 
l_ok pixaConvertToPdfData (PIXA *pixa, l_int32 res, l_float32 scalefactor, l_int32 type, l_int32 quality, const char *title, l_uint8 **pdata, size_t *pnbytes)
 
l_ok convertToPdf (const char *filein, l_int32 type, l_int32 quality, const char *fileout, l_int32 x, l_int32 y, l_int32 res, const char *title, L_PDF_DATA **plpd, l_int32 position)
 
l_ok convertImageDataToPdf (l_uint8 *imdata, size_t size, l_int32 type, l_int32 quality, const char *fileout, l_int32 x, l_int32 y, l_int32 res, const char *title, L_PDF_DATA **plpd, l_int32 position)
 
l_ok convertToPdfData (const char *filein, l_int32 type, l_int32 quality, l_uint8 **pdata, size_t *pnbytes, l_int32 x, l_int32 y, l_int32 res, const char *title, L_PDF_DATA **plpd, l_int32 position)
 
l_ok convertImageDataToPdfData (l_uint8 *imdata, size_t size, l_int32 type, l_int32 quality, l_uint8 **pdata, size_t *pnbytes, l_int32 x, l_int32 y, l_int32 res, const char *title, L_PDF_DATA **plpd, l_int32 position)
 
l_ok pixConvertToPdf (PIX *pix, l_int32 type, l_int32 quality, const char *fileout, l_int32 x, l_int32 y, l_int32 res, const char *title, L_PDF_DATA **plpd, l_int32 position)
 
l_ok pixWriteStreamPdf (FILE *fp, PIX *pix, l_int32 res, const char *title)
 
l_ok pixWriteMemPdf (l_uint8 **pdata, size_t *pnbytes, PIX *pix, l_int32 res, const char *title)
 
l_ok convertSegmentedFilesToPdf (const char *dirname, const char *substr, l_int32 res, l_int32 type, l_int32 thresh, BOXAA *baa, l_int32 quality, l_float32 scalefactor, const char *title, const char *fileout)
 
BOXAA * convertNumberedMasksToBoxaa (const char *dirname, const char *substr, l_int32 numpre, l_int32 numpost)
 
l_ok convertToPdfSegmented (const char *filein, l_int32 res, l_int32 type, l_int32 thresh, BOXA *boxa, l_int32 quality, l_float32 scalefactor, const char *title, const char *fileout)
 
l_ok pixConvertToPdfSegmented (PIX *pixs, l_int32 res, l_int32 type, l_int32 thresh, BOXA *boxa, l_int32 quality, l_float32 scalefactor, const char *title, const char *fileout)
 
l_ok convertToPdfDataSegmented (const char *filein, l_int32 res, l_int32 type, l_int32 thresh, BOXA *boxa, l_int32 quality, l_float32 scalefactor, const char *title, l_uint8 **pdata, size_t *pnbytes)
 
l_ok pixConvertToPdfDataSegmented (PIX *pixs, l_int32 res, l_int32 type, l_int32 thresh, BOXA *boxa, l_int32 quality, l_float32 scalefactor, const char *title, l_uint8 **pdata, size_t *pnbytes)
 
l_ok concatenatePdf (const char *dirname, const char *substr, const char *fileout)
 
l_ok saConcatenatePdf (SARRAY *sa, const char *fileout)
 
l_ok ptraConcatenatePdf (L_PTRA *pa, const char *fileout)
 
l_ok concatenatePdfToData (const char *dirname, const char *substr, l_uint8 **pdata, size_t *pnbytes)
 
l_ok saConcatenatePdfToData (SARRAY *sa, l_uint8 **pdata, size_t *pnbytes)
 

Variables

static const l_int32 DefaultInputRes = 300
 

Detailed Description

   Higher-level operations for generating pdf from images.
   Use poppler's pdftoppm or pdfimages to invert the process,
   extracting raster images from pdf.
   |=============================================================|
   |                        Important notes                      |
   |=============================================================|
   | Some of these functions require I/O libraries such as       |
   | libtiff, libjpeg, libpng, libz and libopenjp2.  If you do   |
   | not have these libraries, some calls will fail.  For        |
   | example, if you do not have libopenjp2, you cannot write a  |
   | pdf where transcoding is required to incorporate a          |
   | jp2k image.                                                 |
   |                                                             |
   | You can manually deactivate all pdf writing by setting      |
   | this in environ.h:                                          |
   |                                                             |
   |          #define  USE_PDFIO     0                           |
   |                                                             |
   | This will link the stub file pdfiostub.c.                   |
   |=============================================================|
    Set 1. These functions convert a set of image files
    to a multi-page pdf file, with one image on each page.
    All images are rendered at the same (input) resolution.
    The images can be specified as being in a directory, or they
    can be in an sarray.  The output pdf can be either a file
    or an array of bytes in memory.
    Set 2. These functions are a special case of set 1, where
    no scaling or change in quality is required.  For jpeg, jp2k and
    tiffg4 images, the bytes in each file can be directly incorporated
    into the output pdf, and the wrapping up of multiple image
    files is very fast.  For non-interlaced png, the data bytes
    including the predictors can also be written directly into the
    flate pdf data.  For other image formats transcoding is required,
    where the image data is first decompressed and then flate (gzip),
    DCT (jpeg) or tiffg4 (1 bpp) encodings are generated.
    Set 3. These functions convert a set of images in memory
    to a multi-page pdf, with one image on each page.  The pdf
    output can be either a file or an array of bytes in memory.
    Set 4. These functions implement a pdf output "device driver"
    for wrapping (encoding) any number of images on a single page
    in pdf.  The input can be either an image file or a Pix;
    the pdf output can be either a file or an array of bytes in memory.
    Set 5. These "segmented" functions take a set of image
    files, along with optional segmentation information, and
    generate a multi-page pdf file, where each page consists
    in general of a mixed raster pdf of image and non-image regions.
    The segmentation information for each page can be input as
    either a mask over the image parts, or as a Boxa of those
    regions.
    Set 6. These "segmented" functions convert an image and
    an optional Boxa of image regions into a mixed raster pdf file
    for the page.  The input image can be either a file or a Pix.
    Set 7. These functions take a set of single-page pdf files
    and concatenate them into a multi-page pdf.  The input can be
    a set of either single page pdf files or pdf 'strings' in memory.
    The output can be either a file or an array of bytes in memory.
    The images in the pdf file can be rendered using a pdf viewer,
    such as evince, gv, xpdf or acroread.
    Reference on the pdf file format:
        http://www.adobe.com/devnet/pdf/pdf_reference_archive.html
    1. Convert specified image files to pdf (one image file per page)
         l_int32             convertFilesToPdf()
         l_int32             saConvertFilesToPdf()
         l_int32             saConvertFilesToPdfData()
         l_int32             selectDefaultPdfEncoding()
    2. Convert specified image files to pdf without scaling
         l_int32             convertUnscaledFilesToPdf()
         l_int32             saConvertUnscaledFilesToPdf()
         l_int32             saConvertUnscaledFilesToPdfData()
         l_int32             convertUnscaledToPdfData()
    3. Convert multiple images to pdf (one image per page)
         l_int32             pixaConvertToPdf()
         l_int32             pixaConvertToPdfData()
    4. Single page, multi-image converters
         l_int32             convertToPdf()
         l_int32             convertImageDataToPdf()
         l_int32             convertToPdfData()
         l_int32             convertImageDataToPdfData()
         l_int32             pixConvertToPdf()
         l_int32             pixWriteStreamPdf()
         l_int32             pixWriteMemPdf()
    5. Segmented multi-page, multi-image converter
         l_int32             convertSegmentedFilesToPdf()
         BOXAA              *convertNumberedMasksToBoxaa()
    6. Segmented single page, multi-image converters
         l_int32             convertToPdfSegmented()
         l_int32             pixConvertToPdfSegmented()
         l_int32             convertToPdfDataSegmented()
         l_int32             pixConvertToPdfDataSegmented()
    7. Multipage concatenation
         l_int32             concatenatePdf()
         l_int32             saConcatenatePdf()
         l_int32             ptraConcatenatePdf()
         l_int32             concatenatePdfToData()
         l_int32             saConcatenatePdfToData()
    The top-level multi-image functions can be visualized as follows:
         Output pdf data to file:
            convertToPdf()  and  convertImageDataToPdf()
                    --> pixConvertToPdf()
                          --> pixConvertToPdfData()
         Output pdf data to array in memory:
            convertToPdfData()  and  convertImageDataToPdfData()
                    --> pixConvertToPdfData()
    The top-level segmented image functions can be visualized as follows:
         Output pdf data to file:
            convertToPdfSegmented()
                    --> pixConvertToPdfSegmented()
                          --> pixConvertToPdfDataSegmented()
         Output pdf data to array in memory:
            convertToPdfDataSegmented()
                    --> pixConvertToPdfDataSegmented()
    For multi-page concatenation, there are three different types of input
       (1) directory and optional filename filter
       (2) sarray of filenames
       (3) ptra of byte arrays of pdf data
    and two types of output for the concatenated pdf data
       (1) filename
       (2) data array and size
    High-level interfaces are given for each of the six combinations.
    Note: When wrapping small images into pdf, it is useful to give
    them a relatively low resolution value, to avoid rounding errors
    when rendering the images.  For example, if you want an image
    of width w pixels to be 5 inches wide on a screen, choose a
    resolution w/5.
    The very fast functions in section (2) require neither transcoding
    nor parsing of the compressed jpeg file.  With three types of image
    compression, the compressed strings can be incorporated into
    the pdf data without decompression and re-encoding: jpeg, jp2k
    and png.  The DCTDecode and JPXDecode filters can handle the
    entire jpeg and jp2k encoded string as a byte array in the pdf file.
    The FlateDecode filter can handle the png compressed image data,
    including predictors that occur as the first byte in each
    raster line, but it is necessary to store only the png IDAT chunk
    data in the pdf array.  The alternative for wrapping png images
    is to transcode them: uncompress into a raster (a pix) and then
    gzip the raster data.  This typically results in a larger pdf file
    because it doesn't use the two-dimensional png predictor.
    Colormaps, which are found in png PLTE chunks, must always be
    pulled out and included separately in the pdf.  For CCITT-G4
    compression, you can not simply include a tiff G4 file -- you must
    either parse it and extract the G4 compressed data within it,
    or uncompress to a raster and G4 compress again.

Definition in file pdfio1.c.

Function Documentation

◆ concatenatePdf()

l_ok concatenatePdf ( const char *  dirname,
const char *  substr,
const char *  fileout 
)

concatenatePdf()

Parameters
[in]dirnamedirectory name containing single-page pdf files
[in]substr[optional] substring filter on filenames; can be NULL
[in]fileoutconcatenated pdf file
Returns
0 if OK, 1 on error
Notes:
     (1) This only works with leptonica-formatted single-page pdf files.
     (2) If substr is not NULL, only filenames that contain
         the substring can be returned.  If substr == NULL,
         none of the filenames are filtered out.
     (3) The files in the directory, after optional filtering by
         the substring, are lexically sorted in increasing order
         before concatenation.

Definition at line 2055 of file pdfio1.c.

References getSortedPathnamesInDirectory(), saConcatenatePdf(), and sarrayDestroy().

◆ concatenatePdfToData()

l_ok concatenatePdfToData ( const char *  dirname,
const char *  substr,
l_uint8 **  pdata,
size_t *  pnbytes 
)

concatenatePdfToData()

Parameters
[in]dirnamedirectory name containing single-page pdf files
[in]substr[optional] substring filter on filenames; can be NULL
[out]pdataconcatenated pdf data in memory
[out]pnbytesnumber of bytes in pdf data
Returns
0 if OK, 1 on error
Notes:
     (1) This only works with leptonica-formatted single-page pdf files.
     (2) If substr is not NULL, only filenames that contain
         the substring can be returned.  If substr == NULL,
         none of the filenames are filtered out.
     (3) The files in the directory, after optional filtering by
         the substring, are lexically sorted in increasing order
         before concatenation.

Definition at line 2170 of file pdfio1.c.

References getSortedPathnamesInDirectory(), saConcatenatePdfToData(), and sarrayDestroy().

◆ convertFilesToPdf()

l_ok convertFilesToPdf ( const char *  dirname,
const char *  substr,
l_int32  res,
l_float32  scalefactor,
l_int32  type,
l_int32  quality,
const char *  title,
const char *  fileout 
)

convertFilesToPdf()

Parameters
[in]dirnamedirectory name containing images
[in]substr[optional] substring filter on filenames; can be NULL
[in]resinput resolution of all images
[in]scalefactorscaling factor applied to each image; > 0.0
[in]typeencoding type (L_JPEG_ENCODE, L_G4_ENCODE, L_FLATE_ENCODE, L_JP2K_ENCODE or L_DEFAULT_ENCODE for default)
[in]qualityfor jpeg: 1-100, 0 for default (75); for jp2k: 27-45, 0 for default (34)
[in]title[optional] pdf title; if null, taken from the first image filename
[in]fileoutpdf file of all images
Returns
0 if OK, 1 on error
Notes:
     (1) If substr is not NULL, only image filenames that contain
         the substring can be used.  If substr == NULL, all files
         in the directory are used.
     (2) The files in the directory, after optional filtering by
         the substring, are lexically sorted in increasing order
         before concatenation.
     (3) The scalefactor is applied to each image before encoding.
         If you enter a value <= 0.0, it will be set to 1.0.
     (4) Specifying one of the four encoding types for type forces
         all images to be compressed with that type.  Use 0 to have
         the type determined for each image based on depth and whether
         or not it has a colormap.

Definition at line 253 of file pdfio1.c.

References getSortedPathnamesInDirectory(), saConvertFilesToPdf(), and sarrayDestroy().

Referenced by pixCompareWithTranslation().

◆ convertImageDataToPdf()

l_ok convertImageDataToPdf ( l_uint8 *  imdata,
size_t  size,
l_int32  type,
l_int32  quality,
const char *  fileout,
l_int32  x,
l_int32  y,
l_int32  res,
const char *  title,
L_PDF_DATA **  plpd,
l_int32  position 
)

convertImageDataToPdf()

Parameters
[in]imdataarray of formatted image data; e.g., png, jpeg
[in]sizesize of image data
[in]typeencoding type (L_JPEG_ENCODE, L_G4_ENCODE, L_FLATE_ENCODE, or L_JP2K_ENCODE)
[in]qualityfor jpeg: 1-100, 0 for default (75); for jp2k: 27-45, 0 for default (34)
[in]fileoutoutput pdf file; only required on last image on page
[in]x,ylocation of lower-left corner of image, in pixels, relative to the PostScript origin (0,0) at the lower-left corner of the page
[in]resoverride the resolution of the input image, in ppi; use 0 to respect the resolution embedded in the input images
[in]title[optional] pdf title
[in,out]plpdptr to lpd, which is created on the first invocation and returned until last image is processed, at which time it is destroyed
[in]positionin image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE, L_LAST_IMAGE
Returns
0 if OK, 1 on error
Notes:
     (1) If res == 0 and the input resolution field is 0,
         this will use DefaultInputRes.
     (2) See comments in convertToPdf().

Definition at line 1071 of file pdfio1.c.

References L_FLATE_ENCODE, L_G4_ENCODE, L_JP2K_ENCODE, L_JPEG_ENCODE, L_LAST_IMAGE, pixConvertToPdf(), pixDestroy(), pixReadMem(), and selectDefaultPdfEncoding().

◆ convertImageDataToPdfData()

l_ok convertImageDataToPdfData ( l_uint8 *  imdata,
size_t  size,
l_int32  type,
l_int32  quality,
l_uint8 **  pdata,
size_t *  pnbytes,
l_int32  x,
l_int32  y,
l_int32  res,
const char *  title,
L_PDF_DATA **  plpd,
l_int32  position 
)

convertImageDataToPdfData()

Parameters
[in]imdataarray of formatted image data; e.g., png, jpeg
[in]sizesize of image data
[in]typeencoding type (L_JPEG_ENCODE, L_G4_ENCODE, L_FLATE_ENCODE, or L_JP2K_ENCODE)
[in]qualityfor jpeg: 1-100, 0 for default (75); for jp2k: 27-45, 0 for default (34)
[out]pdatapdf data in memory
[out]pnbytesnumber of bytes in pdf data
[in]x,ylocation of lower-left corner of image, in pixels, relative to the PostScript origin (0,0) at the lower-left corner of the page
[in]resoverride the resolution of the input image, in ppi; use 0 to respect the resolution embedded in the input images
[in]title[optional] pdf title
[in,out]plpdptr to lpd, which is created on the first invocation and returned until last image is processed, at which time it is destroyed
[in]positionin image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE, L_LAST_IMAGE
Returns
0 if OK, 1 on error
Notes:
     (1) If res == 0 and the input resolution field is 0,
         this will use DefaultInputRes.
     (2) See comments in convertToPdf().

Definition at line 1208 of file pdfio1.c.

References L_FIRST_IMAGE, L_FLATE_ENCODE, L_G4_ENCODE, L_JP2K_ENCODE, L_JPEG_ENCODE, pixConvertToPdfData(), pixDestroy(), pixReadMem(), and selectDefaultPdfEncoding().

◆ convertNumberedMasksToBoxaa()

BOXAA* convertNumberedMasksToBoxaa ( const char *  dirname,
const char *  substr,
l_int32  numpre,
l_int32  numpost 
)

convertNumberedMasksToBoxaa()

Parameters
[in]dirnamedirectory name containing mask images
[in]substr[optional] substring filter on filenames; can be NULL
[in]numprenumber of characters in name before number
[in]numpostnumber of characters in name after number, up to a dot before an extension
Returns
boxaa of mask regions, or NULL on error
Notes:
     (1) This is conveniently used to generate the input boxaa
         for convertSegmentedFilesToPdf().  It guarantees that the
         boxa will be aligned with the page images, even if some
         of the boxa are empty.

Definition at line 1588 of file pdfio1.c.

References boxaaCreate(), boxaaInitFull(), boxaaReplaceBoxa(), boxaCreate(), boxaDestroy(), getNumberedPathnamesInDirectory(), L_NOCOPY, pixConnComp(), pixDestroy(), pixRead(), sarrayDestroy(), sarrayGetCount(), and sarrayGetString().

◆ convertSegmentedFilesToPdf()

l_ok convertSegmentedFilesToPdf ( const char *  dirname,
const char *  substr,
l_int32  res,
l_int32  type,
l_int32  thresh,
BOXAA *  baa,
l_int32  quality,
l_float32  scalefactor,
const char *  title,
const char *  fileout 
)

convertSegmentedFilesToPdf()

Parameters
[in]dirnamedirectory name containing images
[in]substr[optional] substring filter on filenames; can be NULL
[in]resinput resolution of all images
[in]typecompression type for non-image regions; the image regions are always compressed with L_JPEG_ENCODE
[in]threshused for converting gray --> 1 bpp with L_G4_ENCODE
[in]baa[optional] boxaa of image regions
[in]qualityused for JPEG only; 0 for default (75)
[in]scalefactorscaling factor applied to each image region
[in]title[optional] pdf title; if null, taken from the first image filename
[in]fileoutpdf file of all images
Returns
0 if OK, 1 on error
Notes:
     (1) If substr is not NULL, only image filenames that contain
         the substring can be used.  If substr == NULL, all files
         in the directory are used.
     (2) The files in the directory, after optional filtering by
         the substring, are lexically sorted in increasing order
         before concatenation.
     (3) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without
         colormap and many colors, or 32 bpp; FLATE for anything else.
     (4) The boxaa, if it exists, contains one boxa of "image regions"
         for each image file.  The boxa must be aligned with the
         sorted set of images.
     (5) The scalefactor is applied to each image region.  It is
         typically < 1.0, to save bytes in the final pdf, because
         the resolution is often not critical in non-text regions.
     (6) If the non-image regions have pixel depth > 1 and the encoding
         type is G4, they are automatically scaled up by 2x and
         thresholded.  Otherwise, no scaling is performed on them.
     (7) Note that this function can be used to generate multipage
         G4 compressed pdf from any input, by using boxaa == NULL
         and type == L_G4_ENCODE.

Definition at line 1469 of file pdfio1.c.

References boxaaExtendWithInit(), boxaaGetBoxa(), boxaaGetCount(), boxaCreate(), boxaDestroy(), boxaGetCount(), convertToPdfDataSegmented(), getNumberedPathnamesInDirectory(), l_binaryWrite(), l_byteaDestroy(), l_byteaInitFromMem(), L_CLONE, L_NO_COMPACTION, L_NOCOPY, ptraAdd(), ptraConcatenatePdfToData(), ptraCreate(), ptraDestroy(), ptraGetActualCount(), ptraRemove(), sarrayDestroy(), sarrayGetCount(), and sarrayGetString().

◆ convertToPdf()

l_ok convertToPdf ( const char *  filein,
l_int32  type,
l_int32  quality,
const char *  fileout,
l_int32  x,
l_int32  y,
l_int32  res,
const char *  title,
L_PDF_DATA **  plpd,
l_int32  position 
)

convertToPdf()

Parameters
[in]fileininput image file – any format
[in]typeencoding type (L_JPEG_ENCODE, L_G4_ENCODE, L_FLATE_ENCODE, or L_JP2K_ENCODE)
[in]qualityfor jpeg: 1-100, 0 for default (75); for jp2k: 27-45, 0 for default (34)
[in]fileoutoutput pdf file; only required on last image on page
[in]x,ylocation of lower-left corner of image, in pixels, relative to the PostScript origin (0,0) at the lower-left corner of the page
[in]resoverride the resolution of the input image, in ppi; use 0 to respect the resolution embedded in the input images
[in]title[optional] pdf title; if null, taken from filein
[in,out]plpdptr to lpd, which is created on the first invocation and returned until last image is processed, at which time it is destroyed
[in]positionin image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE, L_LAST_IMAGE
Returns
0 if OK, 1 on error
Notes:
     (1) To wrap only one image in pdf, input plpd = NULL, and
         the value of position will be ignored:
           convertToPdf(...  type, quality, x, y, res, NULL, 0);
     (2) To wrap multiple images on a single pdf page, this is called
         once for each successive image.  Do it this way:
           L_PDF_DATA   *lpd;
           convertToPdf(...  type, quality, x, y, res, &lpd, L_FIRST_IMAGE);
           convertToPdf(...  type, quality, x, y, res, &lpd, L_NEXT_IMAGE);
           ...
           convertToPdf(...  type, quality, x, y, res, &lpd, L_LAST_IMAGE);
         This will write the result to the value of fileout specified
         in the first call; succeeding values of fileout are ignored.
         On the last call: the pdf data bytes are computed and written
         to fileout, lpd is destroyed internally, and the returned
         value of lpd is null.  So the client has nothing to clean up.
     (3) (a) Set res == 0 to respect the resolution embedded in the
             image file.  If no resolution is embedded, it will be set
             to the default value.
         (b) Set res to some other value to override the file resolution.
     (4) (a) If the input res and the resolution of the output device
             are equal, the image will be "displayed" at the same size
             as the original.
         (b) If the input res is 72, the output device will render
             the image at 1 pt/pixel.
         (c) Some possible choices for the default input pix resolution are:
                72 ppi     Render pix on any output device at one pt/pixel
                96 ppi     Windows default for generated display images
               300 ppi     Typical default for scanned images.
             We choose 300, which is sensible for rendering page images.
             However,  images come from a variety of sources, and
             some are explicitly created for viewing on a display.

Definition at line 999 of file pdfio1.c.

References convertToPdfData(), l_binaryWrite(), and L_LAST_IMAGE.

◆ convertToPdfData()

l_ok convertToPdfData ( const char *  filein,
l_int32  type,
l_int32  quality,
l_uint8 **  pdata,
size_t *  pnbytes,
l_int32  x,
l_int32  y,
l_int32  res,
const char *  title,
L_PDF_DATA **  plpd,
l_int32  position 
)

convertToPdfData()

Parameters
[in]fileininput image file – any format
[in]typeencoding type (L_JPEG_ENCODE, L_G4_ENCODE, L_FLATE_ENCODE, or L_JP2K_ENCODE)
[in]qualityfor jpeg: 1-100, 0 for default (75); for jp2k: 27-45, 0 for default (34)
[out]pdatapdf data in memory
[out]pnbytesnumber of bytes in pdf data
[in]x,ylocation of lower-left corner of image, in pixels, relative to the PostScript origin (0,0) at the lower-left corner of the page
[in]resoverride the resolution of the input image, in ppi; use 0 to respect the resolution embedded in the input images
[in]title[optional] pdf title; if null, use filein
[in,out]plpdptr to lpd, which is created on the first invocation and returned until last image is processed, at which time it is destroyed
[in]positionin image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE, L_LAST_IMAGE
Returns
0 if OK, 1 on error
Notes:
     (1) If res == 0 and the input resolution field is 0,
         this will use DefaultInputRes.
     (2) See comments in convertToPdf().

Definition at line 1140 of file pdfio1.c.

References pixConvertToPdfData(), pixDestroy(), and pixRead().

Referenced by convertToPdf().

◆ convertToPdfDataSegmented()

l_ok convertToPdfDataSegmented ( const char *  filein,
l_int32  res,
l_int32  type,
l_int32  thresh,
BOXA *  boxa,
l_int32  quality,
l_float32  scalefactor,
const char *  title,
l_uint8 **  pdata,
size_t *  pnbytes 
)

convertToPdfDataSegmented()

Parameters
[in]fileininput image file – any format
[in]resinput image resolution; typ. 300 ppi; use 0 for default
[in]typecompression type for non-image regions; image regions are always compressed with L_JPEG_ENCODE
[in]threshfor converting gray --> 1 bpp with L_G4_ENCODE
[in]boxa[optional] image regions; can be null
[in]qualityused for jpeg image regions; 0 for default
[in]scalefactorused for jpeg regions; must be <= 1.0
[in]title[optional] pdf title; if null, uses filein
[out]pdatapdf data in memory
[out]pnbytesnumber of bytes in pdf data
Returns
0 if OK, 1 on error
Notes:
     (1) If there are no image regions, set boxa == NULL;
         quality and scalefactor are ignored.
     (2) Typically, scalefactor is < 1.0, because the image regions
         can be rendered at a lower resolution (for better compression)
         than the text regions.

Definition at line 1823 of file pdfio1.c.

References L_FLATE_ENCODE, L_G4_ENCODE, L_JPEG_ENCODE, pixConvertToPdfDataSegmented(), pixDestroy(), and pixRead().

Referenced by convertSegmentedFilesToPdf().

◆ convertToPdfSegmented()

l_ok convertToPdfSegmented ( const char *  filein,
l_int32  res,
l_int32  type,
l_int32  thresh,
BOXA *  boxa,
l_int32  quality,
l_float32  scalefactor,
const char *  title,
const char *  fileout 
)

convertToPdfSegmented()

Parameters
[in]fileininput image file – any format
[in]resinput image resolution; typ. 300 ppi; use 0 for default
[in]typecompression type for non-image regions; image regions are always compressed with L_JPEG_ENCODE
[in]threshfor converting gray --> 1 bpp with L_G4_ENCODE
[in]boxa[optional] image regions; can be null
[in]qualityused for jpeg image regions; 0 for default
[in]scalefactorused for jpeg regions; must be <= 1.0
[in]title[optional] pdf title; typically taken from the input file for the pix
[in]fileoutoutput pdf file
Returns
0 if OK, 1 on error
Notes:
     (1) If there are no image regions, set boxa == NULL;
         quality and scalefactor are ignored.
     (2) Typically, scalefactor is < 1.0, because the image regions
         can be rendered at a lower resolution (for better compression)
         than the text regions.  If scalefactor == 0, we use 1.0.
         If the input image is 1 bpp and scalefactor < 1.0, we
         use scaleToGray() to downsample the image regions to gray
         before compressing them.
     (3) If the compression type for non-image regions is L_G4_ENCODE
         and bpp > 1, the image is upscaled 2x and thresholded
         to 1 bpp.  That is the only situation where thresh is used.
     (4) The parameter quality is only used for image regions.
         If type == L_JPEG_ENCODE, default jpeg quality (75) is
         used for the non-image regions.
     (5) Processing matrix for non-image regions.
         Input           G4              JPEG                FLATE
         ----------|---------------------------------------------------
         1 bpp     |  1x, 1 bpp       1x flate, 1 bpp     1x, 1 bpp
                   |
         cmap      |  2x, 1 bpp       1x flate, cmap      1x, cmap
                   |
         2,4 bpp   |  2x, 1 bpp       1x flate            1x, 2,4 bpp
         no cmap   |                  2,4 bpp
                   |
         8,32 bpp  |  2x, 1 bpp       1x (jpeg)           1x, 8,32 bpp
         no cmap   |                  8,32 bpp
         Summary:
         (a) if G4 is requested, G4 is used, with 2x upscaling
             for all cases except 1 bpp.
         (b) if JPEG is requested, use flate encoding for all cases
             except 8 bpp without cmap and 32 bpp (rgb).
         (c) if FLATE is requested, use flate with no transformation
             of the raster data.
     (6) Calling options/sequence for these functions:
             file  -->  file      (convertToPdfSegmented)
                 pix  -->  file      (pixConvertToPdfSegmented)
                     pix  -->  data      (pixConvertToPdfDataSegmented)
             file  -->  data      (convertToPdfDataSegmented)
                     pix  -->  data      (pixConvertToPdfDataSegmented)

Definition at line 1698 of file pdfio1.c.

References L_FLATE_ENCODE, L_G4_ENCODE, L_JPEG_ENCODE, pixConvertToPdfSegmented(), pixDestroy(), and pixRead().

◆ convertUnscaledFilesToPdf()

l_ok convertUnscaledFilesToPdf ( const char *  dirname,
const char *  substr,
const char *  title,
const char *  fileout 
)

convertUnscaledFilesToPdf()

Parameters
[in]dirnamedirectory name containing images
[in]substr[optional] substring filter on filenames; can be NULL
[in]title[optional] pdf title; if null, taken from the first image filename
[in]fileoutpdf file of all images
Returns
0 if OK, 1 on error
Notes:
     (1) If substr is not NULL, only image filenames that contain
         the substring can be used.  If substr == NULL, all files
         in the directory are used.
     (2) The files in the directory, after optional filtering by
         the substring, are lexically sorted in increasing order
         before concatenation.
     (3) This is very fast for jpeg, jp2k and some png files,
         because the compressed data is wrapped up and concatenated.
         For other types of png, the images must be read and recompressed.

Definition at line 540 of file pdfio1.c.

References getSortedPathnamesInDirectory(), saConvertUnscaledFilesToPdf(), and sarrayDestroy().

◆ convertUnscaledToPdfData()

l_ok convertUnscaledToPdfData ( const char *  fname,
const char *  title,
l_uint8 **  pdata,
size_t *  pnbytes 
)

convertUnscaledToPdfData()

Parameters
[in]fnamefilename of an image file, in any supported format
[in]title[optional] pdf title; can be NULL
[out]pdataoutput pdf data for image
[out]pnbytessize of output pdf data
Returns
0 if OK, 1 on error
Notes:
     (1) This is very fast for jpeg, jp2k and some png files,
         because the compressed data is wrapped up and concatenated.
         For other types of png, the images must be read and recompressed.

Definition at line 702 of file pdfio1.c.

References findFileFormat().

Referenced by saConvertUnscaledFilesToPdfData().

◆ pixaConvertToPdf()

l_ok pixaConvertToPdf ( PIXA *  pixa,
l_int32  res,
l_float32  scalefactor,
l_int32  type,
l_int32  quality,
const char *  title,
const char *  fileout 
)

pixaConvertToPdf()

Parameters
[in]pixacontaining images all at the same resolution
[in]resoverride the resolution of each input image, in ppi; use 0 to respect the resolution embedded in the input images
[in]scalefactorscaling factor applied to each image; > 0.0
[in]typeencoding type (L_JPEG_ENCODE, L_G4_ENCODE, L_FLATE_ENCODE, L_JP2K_ENCODE, or L_DEFAULT_ENCODE for default)
[in]qualityfor jpeg: 1-100, 0 for default (75); for jp2k: 27-45, 0 for default (34)
[in]title[optional] pdf title
[in]fileoutpdf file of all images
Returns
0 if OK, 1 on error
Notes:
     (1) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without
         colormap and many colors, or 32 bpp; FLATE for anything else.
     (2) The scalefactor must be > 0.0; otherwise it is set to 1.0.
     (3) Specifying one of the four encoding types for type forces
         all images to be compressed with that type.  Use
         L_DEFAULT_ENCODE (0) to have the type determined for each
         image based on depth and whether or not it has a colormap.

Definition at line 790 of file pdfio1.c.

References l_binaryWrite(), and pixaConvertToPdfData().

Referenced by convertTiffMultipageToPdf(), pixaCompareInPdf(), pixaSelectToPdf(), and pixCompareWithTranslation().

◆ pixaConvertToPdfData()

l_ok pixaConvertToPdfData ( PIXA *  pixa,
l_int32  res,
l_float32  scalefactor,
l_int32  type,
l_int32  quality,
const char *  title,
l_uint8 **  pdata,
size_t *  pnbytes 
)

pixaConvertToPdfData()

Parameters
[in]pixacontaining images all at the same resolution
[in]resinput resolution of all images
[in]scalefactorscaling factor applied to each image; > 0.0
[in]typeencoding type (L_JPEG_ENCODE, L_G4_ENCODE, L_FLATE_ENCODE, L_JP2K_ENCODE, or L_DEFAULT_ENCODE for default)
[in]qualityfor jpeg: 1-100, 0 for default (75); for jp2k: 27-45, 0 for default (34)
[in]title[optional] pdf title
[out]pdataoutput pdf data of all images
[out]pnbytessize of output pdf data
Returns
0 if OK, 1 on error
Notes:
     (1) See pixaConvertToPdf().

Definition at line 844 of file pdfio1.c.

References l_byteaDestroy(), l_byteaInitFromMem(), L_CLONE, L_DEFAULT_ENCODE, L_FLATE_ENCODE, L_G4_ENCODE, L_JP2K_ENCODE, L_JPEG_ENCODE, L_NO_COMPACTION, pixaGetCount(), pixaGetPix(), pixClone(), pixConvertToPdfData(), pixDestroy(), pixScale(), ptraAdd(), ptraConcatenatePdfToData(), ptraCreate(), ptraDestroy(), ptraGetActualCount(), ptraRemove(), and selectDefaultPdfEncoding().

Referenced by pixaConvertToPdf().

◆ pixConvertToPdf()

l_ok pixConvertToPdf ( PIX *  pix,
l_int32  type,
l_int32  quality,
const char *  fileout,
l_int32  x,
l_int32  y,
l_int32  res,
const char *  title,
L_PDF_DATA **  plpd,
l_int32  position 
)

pixConvertToPdf()

Parameters
[in]pix
[in]typeencoding type (L_JPEG_ENCODE, L_G4_ENCODE, L_FLATE_ENCODE, L_JP2K_ENCODE)
[in]qualityfor jpeg: 1-100, 0 for default (75); for jp2k: 27-45, 0 for default (34)
[in]fileoutoutput pdf file; only required on last image on page
[in]x,ylocation of lower-left corner of image, in pixels, relative to the PostScript origin (0,0) at the lower-left corner of the page
[in]resoverride the resolution of the input image, in ppi; use 0 to respect the resolution embedded in the input images
[in]title[optional] pdf title
[in,out]plpdptr to lpd, which is created on the first invocation and returned until last image is processed, at which time it is destroyed
[in]positionin image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE, L_LAST_IMAGE
Returns
0 if OK, 1 on error
Notes:
     (1) If res == 0 and the input resolution field is 0,
         this will use DefaultInputRes.
     (2) This only writes data to fileout if it is the last
         image to be written on the page.
     (3) See comments in convertToPdf().

Definition at line 1286 of file pdfio1.c.

References l_binaryWrite(), L_LAST_IMAGE, and pixConvertToPdfData().

Referenced by convertImageDataToPdf().

◆ pixConvertToPdfDataSegmented()

l_ok pixConvertToPdfDataSegmented ( PIX *  pixs,
l_int32  res,
l_int32  type,
l_int32  thresh,
BOXA *  boxa,
l_int32  quality,
l_float32  scalefactor,
const char *  title,
l_uint8 **  pdata,
size_t *  pnbytes 
)

pixConvertToPdfDataSegmented()

Parameters
[in]pixsany depth, cmap OK
[in]resinput image resolution; typ. 300 ppi; use 0 for default
[in]typecompression type for non-image regions; image regions are always compressed with L_JPEG_ENCODE
[in]threshfor converting gray --> 1 bpp with L_G4_ENCODE
[in]boxa[optional] of image regions; can be null
[in]qualityused for jpeg image regions; 0 for default
[in]scalefactorused for jpeg regions; must be <= 1.0
[in]title[optional] pdf title; typically taken from the input file for the pix
[out]pdatapdf data in memory
[out]pnbytesnumber of bytes in pdf data
Returns
0 if OK, 1 on error
Notes:
     (1) See convertToPdfSegmented() for details.

Definition at line 1891 of file pdfio1.c.

References L_FLATE_ENCODE, L_G4_ENCODE, and L_JPEG_ENCODE.

Referenced by convertToPdfDataSegmented(), and pixConvertToPdfSegmented().

◆ pixConvertToPdfSegmented()

l_ok pixConvertToPdfSegmented ( PIX *  pixs,
l_int32  res,
l_int32  type,
l_int32  thresh,
BOXA *  boxa,
l_int32  quality,
l_float32  scalefactor,
const char *  title,
const char *  fileout 
)

pixConvertToPdfSegmented()

Parameters
[in]pixsany depth, cmap OK
[in]resinput image resolution; typ. 300 ppi; use 0 for default
[in]typecompression type for non-image regions; image regions are always compressed with L_JPEG_ENCODE
[in]threshfor converting gray --> 1 bpp with L_G4_ENCODE
[in]boxa[optional] of image regions; can be null
[in]qualityused for jpeg image regions; 0 for default
[in]scalefactorused for jpeg regions; must be <= 1.0
[in]title[optional] pdf title; typically taken from the input file for the pix
[in]fileoutoutput pdf file
Returns
0 if OK, 1 on error
Notes:
     (1) See convertToPdfSegmented() for details.

Definition at line 1759 of file pdfio1.c.

References l_binaryWrite(), L_FLATE_ENCODE, L_G4_ENCODE, L_JPEG_ENCODE, and pixConvertToPdfDataSegmented().

Referenced by convertToPdfSegmented().

◆ pixWriteMemPdf()

l_ok pixWriteMemPdf ( l_uint8 **  pdata,
size_t *  pnbytes,
PIX *  pix,
l_int32  res,
const char *  title 
)

pixWriteMemPdf()

Parameters
[out]pdatapdf as byte array
[out]pnbytesnumber of bytes in pdf array
[in]pixall depths, cmap OK
[in]resoverride the resolution of the input image, in ppi; use 0 to respect the res embedded in the input
[in]title[optional] pdf title; taken from the first image placed on a page; e.g., an input image filename
Returns
0 if OK, 1 on error
Notes:
     (1) This is the simplest interface for writing a single image
         with pdf encoding to memory.  It uses G4 encoding for 1 bpp,
         and makes a guess whether to use JPEG or FLATE encoding for
         everything else.

Definition at line 1395 of file pdfio1.c.

References pixConvertToPdfData(), and selectDefaultPdfEncoding().

Referenced by pixWriteStreamPdf().

◆ pixWriteStreamPdf()

l_ok pixWriteStreamPdf ( FILE *  fp,
PIX *  pix,
l_int32  res,
const char *  title 
)

pixWriteStreamPdf()

Parameters
[in]fpfile stream opened for writing
[in]pixall depths, cmap OK
[in]resoverride the resolution of the input image, in ppi; use 0 to respect the resolution embedded in the input
[in]title[optional] pdf title; taken from the first image placed on a page; e.g., an input image filename
Returns
0 if OK, 1 on error
Notes:
     (1) This is the simplest interface for writing a single image
         with pdf encoding to a stream.  It uses G4 encoding for 1 bpp,
         and makes a guess whether to use JPEG or FLATE encoding for
         everything else (see pixWriteMemPdf(), which it calls).

Definition at line 1346 of file pdfio1.c.

References pixWriteMemPdf().

◆ ptraConcatenatePdf()

l_ok ptraConcatenatePdf ( L_PTRA *  pa,
const char *  fileout 
)

ptraConcatenatePdf()

Parameters
[in]paarray of pdf strings, each for a single-page pdf file
[in]fileoutconcatenated pdf file
Returns
0 if OK, 1 on error
Notes:
     (1) This only works with leptonica-formatted single-page pdf files.

Definition at line 2126 of file pdfio1.c.

References l_binaryWrite(), and ptraConcatenatePdfToData().

◆ saConcatenatePdf()

l_ok saConcatenatePdf ( SARRAY *  sa,
const char *  fileout 
)

saConcatenatePdf()

Parameters
[in]sastring array of pathnames for single-page pdf files
[in]fileoutconcatenated pdf file
Returns
0 if OK, 1 on error
Notes:
     (1) This only works with leptonica-formatted single-page pdf files.

Definition at line 2090 of file pdfio1.c.

References l_binaryWrite(), and saConcatenatePdfToData().

Referenced by concatenatePdf().

◆ saConcatenatePdfToData()

l_ok saConcatenatePdfToData ( SARRAY *  sa,
l_uint8 **  pdata,
size_t *  pnbytes 
)

saConcatenatePdfToData()

Parameters
[in]sastring array of pathnames for single-page pdf files
[out]pdataconcatenated pdf data in memory
[out]pnbytesnumber of bytes in pdf data
Returns
0 if OK, 1 on error
Notes:
     (1) This only works with leptonica-formatted single-page pdf files.

Definition at line 2211 of file pdfio1.c.

References l_byteaDestroy(), l_byteaInitFromFile(), L_NO_COMPACTION, L_NOCOPY, ptraAdd(), ptraConcatenatePdfToData(), ptraCreate(), ptraDestroy(), ptraGetActualCount(), ptraRemove(), sarrayGetCount(), and sarrayGetString().

Referenced by concatenatePdfToData(), and saConcatenatePdf().

◆ saConvertFilesToPdf()

l_ok saConvertFilesToPdf ( SARRAY *  sa,
l_int32  res,
l_float32  scalefactor,
l_int32  type,
l_int32  quality,
const char *  title,
const char *  fileout 
)

saConvertFilesToPdf()

Parameters
[in]sastring array of pathnames for images
[in]resinput resolution of all images
[in]scalefactorscaling factor applied to each image; > 0.0
[in]typeencoding type (L_JPEG_ENCODE, L_G4_ENCODE, L_FLATE_ENCODE, L_JP2K_ENCODE or L_DEFAULT_ENCODE for default)
[in]qualityfor jpeg: 1-100, 0 for default (75); for jp2k: 27-45, 0 for default (34)
[in]title[optional] pdf title; if null, taken from the first image filename
[in]fileoutpdf file of all images
Returns
0 if OK, 1 on error
Notes:
     (1) See convertFilesToPdf().

Definition at line 303 of file pdfio1.c.

References l_binaryWrite(), and saConvertFilesToPdfData().

Referenced by convertFilesToPdf().

◆ saConvertFilesToPdfData()

l_ok saConvertFilesToPdfData ( SARRAY *  sa,
l_int32  res,
l_float32  scalefactor,
l_int32  type,
l_int32  quality,
const char *  title,
l_uint8 **  pdata,
size_t *  pnbytes 
)

saConvertFilesToPdfData()

Parameters
[in]sastring array of pathnames for images
[in]resinput resolution of all images
[in]scalefactorscaling factor applied to each image; > 0.0
[in]typeencoding type (L_JPEG_ENCODE, L_G4_ENCODE, L_FLATE_ENCODE, L_JP2K_ENCODE or L_DEFAULT_ENCODE for default)
[in]qualityfor jpeg: 1-100, 0 for default (75); for jp2k: 27-45, 0 for default (34)
[in]title[optional] pdf title; if null, taken from the first image filename
[out]pdataoutput pdf data (of all images)
[out]pnbytessize of output pdf data
Returns
0 if OK, 1 on error
Notes:
     (1) See convertFilesToPdf().

Definition at line 358 of file pdfio1.c.

References l_byteaDestroy(), l_byteaInitFromMem(), L_DEFAULT_ENCODE, L_FLATE_ENCODE, L_G4_ENCODE, L_JP2K_ENCODE, L_JPEG_ENCODE, L_NO_COMPACTION, L_NOCOPY, lept_stderr(), pixClone(), pixConvertToPdfData(), pixDestroy(), pixRead(), pixScale(), ptraAdd(), ptraConcatenatePdfToData(), ptraCreate(), ptraDestroy(), ptraGetActualCount(), ptraRemove(), sarrayGetCount(), sarrayGetString(), and selectDefaultPdfEncoding().

Referenced by saConvertFilesToPdf().

◆ saConvertUnscaledFilesToPdf()

l_ok saConvertUnscaledFilesToPdf ( SARRAY *  sa,
const char *  title,
const char *  fileout 
)

saConvertUnscaledFilesToPdf()

Parameters
[in]sastring array of pathnames for images
[in]title[optional] pdf title; if null, taken from the first image filename
[in]fileoutpdf file of all images
Returns
0 if OK, 1 on error
Notes:
     (1) See convertUnscaledFilesToPdf().

Definition at line 578 of file pdfio1.c.

References l_binaryWrite(), and saConvertUnscaledFilesToPdfData().

Referenced by convertUnscaledFilesToPdf().

◆ saConvertUnscaledFilesToPdfData()

l_ok saConvertUnscaledFilesToPdfData ( SARRAY *  sa,
const char *  title,
l_uint8 **  pdata,
size_t *  pnbytes 
)

saConvertUnscaledFilesToPdfData()

Parameters
[in]sastring array of pathnames for image files
[in]title[optional] pdf title; if null, taken from the first image filename
[out]pdataoutput pdf data (of all images)
[out]pnbytessize of output pdf data
Returns
0 if OK, 1 on error
Notes:
     (1) This is very fast for jpeg, jp2k and some png files,
         because the compressed data is wrapped up and concatenated.
         For other types of png, the images must be read and recompressed.

Definition at line 623 of file pdfio1.c.

References convertUnscaledToPdfData(), l_byteaDestroy(), l_byteaInitFromMem(), L_NO_COMPACTION, L_NOCOPY, lept_stderr(), ptraAdd(), ptraConcatenatePdfToData(), ptraCreate(), ptraDestroy(), ptraGetActualCount(), ptraRemove(), sarrayGetCount(), and sarrayGetString().

Referenced by saConvertUnscaledFilesToPdf().

◆ selectDefaultPdfEncoding()

l_ok selectDefaultPdfEncoding ( PIX *  pix,
l_int32 *  ptype 
)

selectDefaultPdfEncoding()

Parameters
[in]pix
[out]ptypeL_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE
Returns
0 if OK, 1 on error
Notes:
     (1) This attempts to choose an encoding for the pix that results
         in the smallest file, assuming that if jpeg encoded, it will
         use quality = 75.  The decision is approximate, in that
         (a) all colormapped images will be losslessly encoded with
         gzip (flate), and (b) an image with fewer than about 20 colors
         is likely to be smaller if flate encoded than if encoded
         as a jpeg (dct).  For example, an image made by pixScaleToGray3()
         will have 10 colors, and flate encoding will give about
         twice the compression of jpeg with quality = 75.

Definition at line 477 of file pdfio1.c.

References L_FLATE_ENCODE, and pixGetDimensions().

Referenced by convertImageDataToPdf(), convertImageDataToPdfData(), pixaConvertToPdfData(), pixConvertToPdfData(), pixGenerateCIData(), pixWriteMemPdf(), and saConvertFilesToPdfData().