Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

TextOutputDev.h

Go to the documentation of this file.
00001 //========================================================================
00002 //
00003 // TextOutputDev.h
00004 //
00005 // Copyright 1997-2003 Glyph & Cog, LLC
00006 //
00007 //========================================================================
00008 
00009 #ifndef TEXTOUTPUTDEV_H
00010 #define TEXTOUTPUTDEV_H
00011 
00012 #include <aconf.h>
00013 
00014 #ifdef USE_GCC_PRAGMAS
00015 #pragma interface
00016 #endif
00017 
00018 #include <stdio.h>
00019 #include "gtypes.h"
00020 #include "GfxFont.h"
00021 #include "OutputDev.h"
00022 
00023 class GString;
00024 class GList;
00025 class GfxFont;
00026 class GfxState;
00027 class UnicodeMap;

      // Forward declarations for the classes defined (or only befriended) below.
      // Several are named in signatures before their definition, e.g.
      // TextLine(TextBlock *blkA, ...) and TextPage::assignColumns(TextLineFrag *...).
      // A "friend class X;" line does not make X visible to ordinary name lookup
      // in standard C++, so these names must be declared at namespace scope first.
      class TextWord;
      class TextPool;
      class TextLine;
      class TextLineFrag;
      class TextBlock;
      class TextFlow;
      class TextWordList;
      class TextPage;
00028 
00029 //------------------------------------------------------------------------
00030 // Text output callback type, as taken by TextPage::dump() and TextOutputDev().
00031 typedef void (*TextOutputFunc)(void *stream, char *text, int len);
00032 
00033 //------------------------------------------------------------------------
00034 // TextFontInfo
00035 //------------------------------------------------------------------------
00036 // Font information record, referenced by TextWord::font and TextPage::curFont.
00037 class TextFontInfo {
00038 public:
00039   // Built from a graphics state (presumably its current font -- confirm in .cc).
00040   TextFontInfo(GfxState *state);
00041   ~TextFontInfo();
00042   // Test this record against <state>; the exact criterion lives in the .cc.
00043   GBool matches(GfxState *state);
00044 
00045 private:
00046 
00047   GfxFont *gfxFont;             // the GfxFont this record refers to
00048 #if TEXTOUT_WORD_LIST
00049   GString *fontName;            // handed out by TextWord::getFontName()
00050 #endif
00051 
00052   friend class TextWord;
00053   friend class TextPage;
00054 };
00055 
00056 //------------------------------------------------------------------------
00057 // TextWord
00058 //------------------------------------------------------------------------
00059 // A word: Unicode chars with per-char edges, a bbox, baseline, font and rotation.
00060 class TextWord {
00061 public:
00062 
00063   // Constructor.  The ...A arguments presumably seed rot, charPos and font.
00064   TextWord(GfxState *state, int rotA, double x0, double y0,
00065            int charPosA, TextFontInfo *fontA, double fontSize);
00066 
00067   // Destructor.
00068   ~TextWord();
00069 
00070   // Add a character to the word.
00071   void addChar(GfxState *state, double x, double y,
00072                double dx, double dy, Unicode u);
00073 
00074   // Merge <word> onto the end of <this>.
00075   void merge(TextWord *word);
00076 
00077   // Compares <this> to <word>, returning -1 (<), 0 (=), or +1 (>),
00078   // based on a primary-axis comparison, e.g., x ordering if rot=0.
00079   int primaryCmp(TextWord *word);
00080 
00081   // Return the distance along the primary axis between <this> and
00082   // <word>.
00083   double primaryDelta(TextWord *word);
00084   // Comparator with a qsort()-compatible signature (presumably y, then x).
00085   static int cmpYX(const void *p1, const void *p2);
00086   // The accessors below are compiled only when TEXTOUT_WORD_LIST is non-zero.
00087 #if TEXTOUT_WORD_LIST
00088   int getLength() { return len; }
00089   Unicode getChar(int idx) { return text[idx]; }  // idx is not range-checked
00090   GString *getText();
00091   GString *getFontName() { return font->fontName; }  // font's own pointer, no copy
00092   void getColor(double *r, double *g, double *b)
00093     { *r = colorR; *g = colorG; *b = colorB; }
00094   void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA)
00095     { *xMinA = xMin; *yMinA = yMin; *xMaxA = xMax; *yMaxA = yMax; }
00096   int getCharPos() { return charPos; }
00097   int getCharLen() { return charLen; }
00098 #endif
00099 
00100 private:
00101 
00102   int rot;                      // rotation, multiple of 90 degrees
00103                                 //   (0, 1, 2, or 3)
00104   double xMin, xMax;            // bounding box x coordinates
00105   double yMin, yMax;            // bounding box y coordinates
00106   double base;                  // baseline x or y coordinate
00107   Unicode *text;                // the text
00108   double *edge;                 // "near" edge x or y coord of each char
00109                                 //   (plus one extra entry for the last char)
00110   int len;                      // length of text and edge arrays
00111   int size;                     // size of text and edge arrays
00112   int charPos;                  // character position (within content stream)
00113   int charLen;                  // number of content stream characters in
00114                                 //   this word
00115   TextFontInfo *font;           // font information
00116   double fontSize;              // font size
00117   GBool spaceAfter;             // set if there is a space between this
00118                                 //   word and the next word on the line
00119   TextWord *next;               // next word in line
00120 
00121 #if TEXTOUT_WORD_LIST
00122   double colorR,                // word color
00123          colorG,
00124          colorB;
00125 #endif
00126 
00127   friend class TextPool;
00128   friend class TextLine;
00129   friend class TextBlock;
00130   friend class TextFlow;
00131   friend class TextWordList;
00132   friend class TextPage;
00133 };
00134 
00135 //------------------------------------------------------------------------
00136 // TextPool
00137 //------------------------------------------------------------------------
00138 // Words bucketed by baseline index; TextPage keeps one pool per rotation.
00139 class TextPool {
00140 public:
00141 
00142   TextPool();
00143   ~TextPool();
00144   // Bucket accessors: slot is baseIdx - minBaseIdx, with no range check.
00145   TextWord *getPool(int baseIdx) { return pool[baseIdx - minBaseIdx]; }
00146   void setPool(int baseIdx, TextWord *p) { pool[baseIdx - minBaseIdx] = p; }
00147   // Bucket index for baseline coordinate <base> (formula is in the .cc).
00148   int getBaseIdx(double base);
00149   // Add <word> to the pool (bucket choice and ordering are in the .cc).
00150   void addWord(TextWord *word);
00151 
00152 private:
00153 
00154   int minBaseIdx;               // min baseline bucket index
00155   int maxBaseIdx;               // max baseline bucket index
00156   TextWord **pool;              // array of linked lists, one for each
00157                                 //   baseline value (multiple of 4 pts)
00158   TextWord *cursor;             // pointer to last-accessed word
00159   int cursorBaseIdx;            // baseline bucket index of last-accessed word
00160 
00161   friend class TextBlock;
00162   friend class TextPage;
00163 };
00164 
00165 //------------------------------------------------------------------------
00166 // TextLine
00167 //------------------------------------------------------------------------
00168 // One line of text: a list of TextWords plus the line's Unicode text and edges.
00169 class TextLine {
00170 public:
00171   // <blkA> is presumably the parent block (blk); <rotA>/<baseA> seed rot/base.
00172   TextLine(TextBlock *blkA, int rotA, double baseA);
00173   ~TextLine();
00174   // Add <word> to this line's word list.
00175   void addWord(TextWord *word);
00176 
00177   // Return the distance along the primary axis between <this> and
00178   // <line>.
00179   double primaryDelta(TextLine *line);
00180 
00181   // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
00182   // based on a primary-axis comparison, e.g., x ordering if rot=0.
00183   int primaryCmp(TextLine *line);
00184 
00185   // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
00186   // based on a secondary-axis comparison of the baselines, e.g., y
00187   // ordering if rot=0.
00188   int secondaryCmp(TextLine *line);
00189   // Further orderings; cmpXY has a qsort()-compatible signature.
00190   int cmpYX(TextLine *line);
00191 
00192   static int cmpXY(const void *p1, const void *p2);
00193   // NOTE(review): presumably fills text/edge/col from the words -- confirm in .cc.
00194   void coalesce(UnicodeMap *uMap);
00195 
00196 private:
00197 
00198   TextBlock *blk;               // parent block
00199   int rot;                      // text rotation
00200   double xMin, xMax;            // bounding box x coordinates
00201   double yMin, yMax;            // bounding box y coordinates
00202   double base;                  // baseline x or y coordinate
00203   TextWord *words;              // words in this line
00204   TextWord *lastWord;           // last word in this line
00205   Unicode *text;                // Unicode text of the line, including
00206                                 //   spaces between words
00207   double *edge;                 // "near" edge x or y coord of each char
00208                                 //   (plus one extra entry for the last char)
00209   int *col;                     // starting column number of each Unicode char
00210   int len;                      // number of Unicode chars
00211   int convertedLen;             // total number of converted characters
00212   GBool hyphenated;             // set if last char is a hyphen
00213   TextLine *next;               // next line in block
00214 
00215   friend class TextLineFrag;
00216   friend class TextBlock;
00217   friend class TextFlow;
00218   friend class TextWordList;
00219   friend class TextPage;
00220 };
00221 
00222 //------------------------------------------------------------------------
00223 // TextBlock
00224 //------------------------------------------------------------------------
00225 // A block of text on a page: a linked list of TextLines with a bounding box.
00226 class TextBlock {
00227 public:
00228   // <pageA> is presumably the parent page (page); <rotA> the text rotation (rot).
00229   TextBlock(TextPage *pageA, int rotA);
00230   ~TextBlock();
00231   // Add <word> to the block (presumably via pool, which is used until lines exist).
00232   void addWord(TextWord *word);
00233   // NOTE(review): presumably builds the lines list from pooled words -- confirm.
00234   void coalesce(UnicodeMap *uMap);
00235 
00236   // Update this block's priMin and priMax values, looking at <blk>.
00237   void updatePriMinMax(TextBlock *blk);
00238   // Two comparators with qsort()-compatible signatures.
00239   static int cmpXYPrimaryRot(const void *p1, const void *p2);
00240 
00241   static int cmpYXPrimaryRot(const void *p1, const void *p2);
00242   // Block-to-block ordering and distance; exact semantics are in the .cc.
00243   int primaryCmp(TextBlock *blk);
00244 
00245   double secondaryDelta(TextBlock *blk);
00246 
00247   // Returns true if <this> is below <blk>, relative to the page's
00248   // primary rotation.
00249   GBool isBelow(TextBlock *blk);
00250 
00251 private:
00252 
00253   TextPage *page;               // the parent page
00254   int rot;                      // text rotation
00255   double xMin, xMax;            // bounding box x coordinates
00256   double yMin, yMax;            // bounding box y coordinates
00257   double priMin, priMax;        // whitespace bounding box along primary axis
00258 
00259   TextPool *pool;               // pool of words (used only until lines
00260                                 //   are built)
00261   TextLine *lines;              // linked list of lines
00262   TextLine *curLine;            // most recently added line
00263   int nLines;                   // number of lines
00264   int charCount;                // number of characters in the block
00265   int col;                      // starting column
00266   int nColumns;                 // number of columns in the block
00267 
00268   TextBlock *next;              // next block in a list (presumably a flow's)
00269   TextBlock *stackNext;         // link for a second list; purpose is in the .cc
00270 
00271   friend class TextLine;
00272   friend class TextLineFrag;
00273   friend class TextFlow;
00274   friend class TextWordList;
00275   friend class TextPage;
00276 };
00277 
00278 //------------------------------------------------------------------------
00279 // TextFlow
00280 //------------------------------------------------------------------------
00281 // A flow: a linked list of TextBlocks (blocks .. lastBlk) on a TextPage.
00282 class TextFlow {
00283 public:
00284   // <pageA> is presumably the parent page; <blk> presumably the first block.
00285   TextFlow(TextPage *pageA, TextBlock *blk);
00286   ~TextFlow();
00287 
00288   // Add a block to the end of this flow.
00289   void addBlock(TextBlock *blk);
00290 
00291   // Returns true if <blk> fits below <prevBlk> in the flow, i.e., (1)
00292   // it uses a font no larger than the last block added to the flow,
00293   // and (2) it fits within the flow's [priMin, priMax] along the
00294   // primary axis.
00295   GBool blockFits(TextBlock *blk, TextBlock *prevBlk);
00296 
00297 private:
00298 
00299   TextPage *page;               // the parent page
00300   double xMin, xMax;            // bounding box x coordinates
00301   double yMin, yMax;            // bounding box y coordinates
00302   double priMin, priMax;        // whitespace bounding box along primary axis
00303   TextBlock *blocks;            // blocks in flow
00304   TextBlock *lastBlk;           // last block in this flow
00305   TextFlow *next;               // next flow (TextPage::flows is a linked list)
00306 
00307   friend class TextWordList;
00308   friend class TextPage;
00309 };
00310 
00311 #if TEXTOUT_WORD_LIST
00312 
00313 //------------------------------------------------------------------------
00314 // TextWordList
00315 //------------------------------------------------------------------------
00316 // Flat list of TextWords built from a TextPage; read via getLength()/get().
00317 class TextWordList {
00318 public:
00319 
00320   // Build a flat word list, in content stream order (if
00321   // text->rawOrder is true), physical layout order (if <physLayout>
00322   // is true and text->rawOrder is false), or reading order (if both
00323   // flags are false).
00324   TextWordList(TextPage *text, GBool physLayout);
00325 
00326   ~TextWordList();
00327 
00328   // Return the number of words on the list.
00329   int getLength();
00330 
00331   // Return the <idx>th word from the list.
00332   TextWord *get(int idx);
00333 
00334 private:
00335 
00336   GList *words;                 // the list itself; get() hands out TextWord *
00337 };
00338 
00339 #endif // TEXTOUT_WORD_LIST
00340 
00341 //------------------------------------------------------------------------
00342 // TextPage
00343 //------------------------------------------------------------------------
00344 // Text of one page: per-rotation word pools plus the blocks and flows lists.
00345 class TextPage {
00346 public:
00347 
00348   // Constructor.
00349   TextPage(GBool rawOrderA);
00350 
00351   // Destructor.
00352   ~TextPage();
00353 
00354   // Start a new page.
00355   void startPage(GfxState *state);
00356 
00357   // End the current page.
00358   void endPage();
00359 
00360   // Update the current font.
00361   void updateFont(GfxState *state);
00362 
00363   // Begin a new word.
00364   void beginWord(GfxState *state, double x0, double y0);
00365 
00366   // Add a character to the current word.
00367   void addChar(GfxState *state, double x, double y,
00368                double dx, double dy,
00369                CharCode c, Unicode *u, int uLen);
00370 
00371   // End the current word, sorting it into the list of words.
00372   void endWord();
00373 
00374   // Add a word, sorting it into the list of words.
00375   void addWord(TextWord *word);
00376 
00377   // Coalesce strings that look like parts of the same line.
00378   void coalesce(GBool physLayout);
00379 
00380   // Find a string.  If <startAtTop> is true, starts looking at the
00381   // top of the page; else if <startAtLast> is true, starts looking
00382   // immediately after the last find result; else starts looking at
00383   // <xMin>,<yMin>.  If <stopAtBottom> is true, stops looking at the
00384   // bottom of the page; else if <stopAtLast> is true, stops looking
00385   // just before the last find result; else stops looking at
00386   // <xMax>,<yMax>.
00387   GBool findText(Unicode *s, int len,
00388                  GBool startAtTop, GBool stopAtBottom,
00389                  GBool startAtLast, GBool stopAtLast,
00390                  double *xMin, double *yMin,
00391                  double *xMax, double *yMax);
00392 
00393   // Get the text which is inside the specified rectangle.
00394   GString *getText(double xMin, double yMin,
00395                    double xMax, double yMax);
00396 
00397   // Find a string by character position and length.  If found, sets
00398   // the text bounding rectangle and returns true; otherwise returns
00399   // false.
00400   GBool findCharRange(int pos, int length,
00401                       double *xMin, double *yMin,
00402                       double *xMax, double *yMax);
00403 
00404   // Dump contents of page via <outputFunc> (presumably given <outputStream>).
00405   void dump(void *outputStream, TextOutputFunc outputFunc,
00406             GBool physLayout);
00407 
00408 #if TEXTOUT_WORD_LIST
00409   // Build a flat word list, in content stream order (if
00410   // this->rawOrder is true), physical layout order (if <physLayout>
00411   // is true and this->rawOrder is false), or reading order (if both
00412   // flags are false).
00413   TextWordList *makeWordList(GBool physLayout);
00414 #endif
00415 
00416 private:
00417   // Internal helpers (declared here only; their contracts are in the .cc).
00418   void clear();
00419   void assignColumns(TextLineFrag *frags, int nFrags, int rot);
00420   int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GString *s);
00421 
00422   GBool rawOrder;               // keep text in content stream order
00423 
00424   double pageWidth, pageHeight; // width and height of current page
00425   TextWord *curWord;            // currently active string
00426   int charPos;                  // next character position (within content
00427                                 //   stream)
00428   TextFontInfo *curFont;        // current font
00429   double curFontSize;           // current font size
00430   int nest;                     // current nesting level (for Type 3 fonts)
00431   int nTinyChars;               // number of "tiny" chars seen so far
00432   GBool lastCharOverlap;        // set if the last added char overlapped the
00433                                 //   previous char
00434 
00435   TextPool *pools[4];           // a "pool" of TextWords for each rotation
00436   TextFlow *flows;              // linked list of flows
00437   TextBlock **blocks;           // array of blocks, in yx order
00438   int nBlocks;                  // number of blocks
00439   int primaryRot;               // primary rotation
00440   GBool primaryLR;              // primary direction (true means L-to-R,
00441                                 //   false means R-to-L)
00442   TextWord *rawWords;           // list of words, in raw order (only if
00443                                 //   rawOrder is set)
00444   TextWord *rawLastWord;        // last word on rawWords list
00445 
00446   GList *fonts;                 // all font info objects used on this
00447                                 //   page [TextFontInfo]
00448 
00449   double lastFindXMin,          // coordinates of the last "find" result
00450          lastFindYMin;
00451   GBool haveLastFind;           // presumably set once a find has succeeded
00452 
00453   friend class TextLine;
00454   friend class TextLineFrag;
00455   friend class TextBlock;
00456   friend class TextFlow;
00457   friend class TextWordList;
00458 };
00459 
00460 //------------------------------------------------------------------------
00461 // TextOutputDev
00462 //------------------------------------------------------------------------
00463 // OutputDev subclass that keeps the current page's text in a TextPage (text).
00464 class TextOutputDev: public OutputDev {
00465 public:
00466 
00467   // Open a text output file.  If <fileName> is NULL, no file is
00468   // written (this is useful, e.g., for searching text).  If
00469   // <physLayoutA> is true, the original physical layout of the text
00470   // is maintained.  If <rawOrderA> is true, the text is kept in
00471   // content stream order.  (<append>: presumably append mode -- confirm.)
00472   TextOutputDev(char *fileName, GBool physLayoutA,
00473                 GBool rawOrderA, GBool append);
00474 
00475   // Create a TextOutputDev which will write to a generic stream.  If
00476   // <physLayoutA> is true, the original physical layout of the text
00477   // is maintained.  If <rawOrderA> is true, the text is kept in
00478   // content stream order.
00479   TextOutputDev(TextOutputFunc func, void *stream,
00480                 GBool physLayoutA, GBool rawOrderA);
00481 
00482   // Destructor.
00483   virtual ~TextOutputDev();
00484 
00485   // Check if file was successfully created.
00486   virtual GBool isOk() { return ok; }
00487 
00488   //---- get info about output device
00489 
00490   // Does this device use upside-down coordinates?
00491   // (Upside-down means (0,0) is the top left corner of the page.)
00492   virtual GBool upsideDown() { return gTrue; }
00493 
00494   // Does this device use drawChar() or drawString()?
00495   virtual GBool useDrawChar() { return gTrue; }
00496 
00497   // Does this device use beginType3Char/endType3Char?  Otherwise,
00498   // text in Type 3 fonts will be drawn with drawChar/drawString.
00499   virtual GBool interpretType3Chars() { return gFalse; }
00500 
00501   // Does this device need non-text content?
00502   virtual GBool needNonText() { return gFalse; }
00503 
00504   //----- initialization and control
00505 
00506   // Start a page.
00507   virtual void startPage(int pageNum, GfxState *state);
00508 
00509   // End a page.
00510   virtual void endPage();
00511 
00512   //----- update text state
00513   virtual void updateFont(GfxState *state);
00514 
00515   //----- text drawing
00516   virtual void beginString(GfxState *state, GString *s);
00517   virtual void endString(GfxState *state);
00518   virtual void drawChar(GfxState *state, double x, double y,
00519                         double dx, double dy,
00520                         double originX, double originY,
00521                         CharCode c, Unicode *u, int uLen);
00522 
00523   //----- special access
00524 
00525   // Find a string.  If <startAtTop> is true, starts looking at the
00526   // top of the page; else if <startAtLast> is true, starts looking
00527   // immediately after the last find result; else starts looking at
00528   // <xMin>,<yMin>.  If <stopAtBottom> is true, stops looking at the
00529   // bottom of the page; else if <stopAtLast> is true, stops looking
00530   // just before the last find result; else stops looking at
00531   // <xMax>,<yMax>.
00532   GBool findText(Unicode *s, int len,
00533                  GBool startAtTop, GBool stopAtBottom,
00534                  GBool startAtLast, GBool stopAtLast,
00535                  double *xMin, double *yMin,
00536                  double *xMax, double *yMax);
00537 
00538   // Get the text which is inside the specified rectangle.
00539   GString *getText(double xMin, double yMin,
00540                    double xMax, double yMax);
00541 
00542   // Find a string by character position and length.  If found, sets
00543   // the text bounding rectangle and returns true; otherwise returns
00544   // false.
00545   GBool findCharRange(int pos, int length,
00546                       double *xMin, double *yMin,
00547                       double *xMax, double *yMax);
00548 
00549 #if TEXTOUT_WORD_LIST
00550   // Build a flat word list, in content stream order (if
00551   // this->rawOrder is true), physical layout order (if
00552   // this->physLayout is true and this->rawOrder is false), or reading
00553   // order (if both flags are false).
00554   TextWordList *makeWordList();
00555 #endif
00556 
00557 private:
00558 
00559   TextOutputFunc outputFunc;    // output function
00560   void *outputStream;           // output stream
00561   GBool needClose;              // need to close the output file?
00562                                 //   (only if outputStream is a FILE*)
00563   TextPage *text;               // text for the current page
00564   GBool physLayout;             // maintain original physical layout when
00565                                 //   dumping text
00566   GBool rawOrder;               // keep text in content stream order
00567   GBool ok;                     // set up ok?
00568 };
00569 
00570 #endif

Generated on Wed Nov 3 12:59:05 2004 for Lemur Toolkit by doxygen 1.2.18