%!PS

% PS Acrobat PDF Content Extractor
% ==================================
% by  Don Lancaster

% VIEWPDF1.PSL provides a set of utilities for reporting organization and 
% extracting page content from PDF files.

%%%%%%%%%%%%%%%%%%%%%%%%%%% enter filenames here %%%%%%%%%%%%%%%%%%%%%%%%%

   % Each store below overwrites the previous one, so the LAST uncommented
   % filename line is the one that is actually used.
   /diskfilesourcefilename (azaucty1.pdf)  store  % last uncommented chosen

   /diskfilesourcefilename (extracx1.pdf)  store
 %  /diskfilesourcefilename (wordfrex.pdf)  store 
    /diskfilesourcefilename (Tesla_Turbine_thesix.pdf)  store

% Directory prefix that gets prepended to the filename chosen above.
% Each \\ is the PostScript string escape for one reverse slash.
/diskfilesourceheader (C:\\Documents and Settings\\don\\Desktop\\gurugrams\\pdfsnoop\\) store

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% the .PDF file MUST be uncompressed. An UncompressPDF.api plugin is available at
% http://www.tinaja.com/plugins/UncompressPDF.api and elsewhere that can be copied
% into your Windows Acrobat plugin folder. When present, an Uncompressed option
% appears under "save as".

% At present code assumes a single level xref list. This can usually be forced by
% a non-linearized "save as".

% See WORDFREQ.PDF for a tutorial on an earlier word frequency analyzer.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%  Copyright c 2005 by Don Lancaster & Synergetics, Box 809, Thatcher, AZ, 85552
%  (928) 428-4073 Email: don@tinaja.com Website: http://www.tinaja.com
%  Consulting services available http://www.tinaja.com/info01.html
 
%  All commercial rights and all electronic media rights ~fully~ reserved.
%  Linking usually welcome. Reposting expressly forbidden. Version 1.5 

%  To use, modify filenames. Resave as ASCII textfile and send to Distiller. 

%  PRELIMINARY CODE!!! Please report any bugs or enhancements to don@tinaja.com
%  A "no Acrobat file produced" error is normal and expected. Output in log file.

% =========

%  IMPORTANT NOTE: Don Lancaster's file gonzo.ps is recommended but not
%  required for this program.
%  After obvious location mods, uncomment ONE of the following two lines:

% (C:\\Documents and Settings\\don\\Desktop\\gonzo\\gonzo.ps) run  % use internal gonzo
% (A:\\gonzo.ps) run  % use external gonzo

%  NOTE THAT ALL PS FILENAME STRINGS !!!DEMAND!!! DOUBLE REVERSE SLASHES.

%  GONZO20A Guru Gonzo PostScript power tools (Interim release)
%  Includes gonzo justification and layout utilities.

%  Copyright c 1990, 1996, 2001 by Don Lancaster and Synergetics, Box 809,
%  Thatcher Arizona, 85552 (928) 428-4073  don@tinaja.com  support
%  via http://www.tinaja.com  All commercial rights and all electronic
%  media rights **FULLY** reserved. Reposting is expressly forbidden.

% ========

% guru places gonzo, ps.util.1 and nuisance on the dictionary stack (in that
% order) and runs printerror in between. All four names come from gonzo.ps,
% which must already have been run; they are not defined in this file.
/guru {
    gonzo begin
    ps.util.1 begin
    printerror
    nuisance begin
} def

% guru                                           % activate gonzo utilities

% =========

% excerpts from the Gonzo utilities...


%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%   /mergestr
%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%   mergestr merges the two top stack strings into one top stack string  
%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% mergestr merges the two top stack strings into one top stack string

% Stack:  (first) (second)  mergestr  (firstsecond)
% Allocates a fresh string and copies both operands into it; neither
% operand is modified.
/mergestr {
    1 index length                          % first second len1
    1 index length                          % first second len1 len2
    add string                              % first second new
    dup 0 4 index putinterval               % copy first to the front of new
    dup 3 index length 3 index putinterval  % copy second right behind it
    3 1 roll pop pop                        % leave only new
} def


%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%   /makestring
%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%     Converts an array of 0-255 integers into a string. 
%     Used to get from array to string to image
%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Stack:  [ int int ... ]  makestring  (string)
% Builds a string whose bytes are the integers of the array. The bytes are
% written through a NullEncode filter whose data target is the new string.
% The filter is closed (which also flushes it) when done, so that any bytes
% still buffered reach the string and the filter file object is not leaked.
/makestring {
    dup length string                    % array str
    dup /NullEncode filter               % array str filt
    3 -1 roll                            % str filt array
    {1 index exch write} forall          % write each byte; leaves str filt
    closefile                            % flush + close filter, leaving str
} def

%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%   /popbubblesort2
%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%   Sorts by popularity on the second array element of [ (string) count ]
%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Stack:  [ [(word) count] ... ]  popbubblesort2  same-array-sorted
% In-place bubble sort on element 1 (the count) of each entry, largest
% count first. Uses the globals /curmat1 (the array being sorted) and
% /posn (current inner-loop index).
%
% Outer for: N runs from length-1 down to 1.
% Inner for: posn runs 1..N while one "carried" entry (initially element 0)
%   rides on the stack. At each step the carried entry and element posn
%   are compared; the one with the SMALLER count stays carried and the
%   other is written to slot posn-1. After the inner loop the carried
%   (smallest) entry is written to slot posn, which then equals N.
% Arrays of length 0 or 1 run no iterations and are returned unchanged.
/popbubblesort2 { /curmat1 exch store curmat1 length
1 sub -1 1 {curmat1 0 get exch 1 exch 1 exch {/posn exch
store curmat1 posn get 2 copy 1 get exch 1 get lt {exch}
if curmat1 exch posn 1 sub exch put} for curmat1 exch
posn exch put } for curmat1 } bind store

% The general procedure for extracting .PDF page content objects is to find a /Root
% entry which leads to the /Catalog object. The /Catalog object is then read
% to find the /Pages object. This is read in turn to find the /Page object
% array. Individual page object strings are found in /Contents objects.
% /Pages can have /Kids which need to be recursively read. 

% Note that simply searching for strings will not work for a variety of reasons.
% Error trapping is presently moderate but not bulletproof.

% This code is presently limited to single list xrefs, unrevised "0" objects,
% and four level nesting of /Pages objects.

%%%%%%%%%%%%%%%%%%%%
%  file reading setup
%%%%%%%%%%%%%%%%%%%%
%  Creates full filenames and file read objects
%%%%%%%%%%%%%%%%%%%%

% Full path = directory header followed by the chosen filename.
/sourcefilename diskfilesourceheader                % build full filename
diskfilesourcefilename mergestr store  

% Opened once here, at load time; every reader below repositions this
% same file object with setfileposition.
/readfile sourcefilename  (r) file store            % set up input read file

% workstring is the shared readline buffer; readline raises rangecheck on
% any input line longer than this. maxtrips bounds every "repeat" based
% search loop so a malformed file cannot loop forever.
/workstring 60000 string store                      %  maximum read line length
/maxtrips 500000 store                              %  infinite loop preventer 



%%%%%%%%%%%%%%%%%%%%
%  /getobjlist
%%%%%%%%%%%%%%%%%%%%
%  Extracts the .PDF object list into an /xrefarray array
%  Presently limited to SINGLE SECTION xref!
%%%%%%%%%%%%%%%%%%%%

% Stack:  -  getobjlist  -
% Scans the file from the start for the first line that BEGINS with
% "xref", reads the following "first count" header line, then reads
% "count" entry lines into /xrefarray as [ offset generation (flag) ]
% arrays. Entry N of /xrefarray is used as object N by /getobj, so the
% header's first number is effectively assumed to be 0.
% Names such as xref_not_found or no_offset are not defined in this file;
% executing one presumably raises an "undefined" error whose offending
% command names the failure.
% Side effects: sets /failed, /revis, /xrefcount, /offset1, /rev1, /used1
% and /xrefarray; leaves readfile positioned after the last xref entry.
/getobjlist {readfile 0 setfileposition              % go to beginning of file
              /failed true store                     % set error flag
              mark                                   % for trailing space removal
      
          maxtrips {                                 % till finished...
            readfile workstring readline             % get .pdf line
                 {(xref) anchorsearch                % look for line starting with xref

                   {/failed false store
                    pop pop exit}
                   {pop} ifelse

                  }{xref_not_found} ifelse           % error on end of file
                   } repeat

             failed {xref_not_found}if               % error if maxtrips ran out first


             readfile workstring readline            % get xref section header line

             { }{no_xref_length} ifelse

             token {/revis exch store}               % first number on the header line
                   {no_revision} ifelse              % (PDF: first object number)

             token {/xrefcount exch store}           % get xref entry count
                   {mp_xref_count} ifelse 
             
             cleartomark                              % drop leftover header text and mark

            mark                                      % start xref array

           xrefcount {readfile workstring readline

                 { 20 string cvs                       % copy line out of shared workstring

                   token {/offset1 exch store}         % get byte offset
                   {no_offset} ifelse

                   token {/rev1 exch store}            % get generation number
                   {no_revision} ifelse

                   /used1 exch store                   % rest of line: the n / f flag text
    
                   mark offset1 rev1 used1 ]           % build xref data array

                   }
                {not_enough_xrefs} ifelse} repeat
                ] 
                /xrefarray exch store                 % save xref array
                
             } store



%%%%%%%%%%%%%%%%%%%%
%  /getobj
%%%%%%%%%%%%%%%%%%%%
%  Extracts a .PDF object given its number and a 0 revision
%  Each line of object is returned as a string in an array.
%%%%%%%%%%%%%%%%%%%%


% Stack:  objnum  getobj  [ (line) (line) ... ]
% Seeks readfile to the xref offset of object objnum and collects every
% line, up to and including the first line that starts with "endobj",
% into an array of independent strings. The array is then handed to
% /fixbrokenarrays, whose result is returned.
/getobj {
    xrefarray exch get 0 get              % byte offset of the object
    readfile exch setfileposition         % seek there

    [                                     % open the array of lines
    maxtrips {                            % bounded read loop
        readfile workstring readline
        {
            dup length string copy        % private copy; workstring is reused
            dup (endobj) anchorsearch     % does this line start with endobj?
            {pop pop exit} if             % yes: keep the line, stop reading
            pop                           % no: drop search residue, keep line
        }
        {ran_out_of_document} ifelse      % end of file before endobj
    } repeat
    ]                                     % close the array of lines

    fixbrokenarrays                       % rejoin arrays split across lines
} store

%%%%%%%%%%%%%%%%%%%%
%  /fixbrokenarrays
%%%%%%%%%%%%%%%%%%%%
%  Attempts to place array elements not in a stream all into a single line string.
%%%%%%%%%%%%%%%%%%%%

% Stack:  [ (line) ... ]  fixbrokenarrays  [ (line) ... ]
% If no line of the object starts with "stream", repeatedly looks for a
% line that contains "[" with no "]" after it and lets /tryrepair merge
% the following line into it, until no such line remains (or 2000 passes).
% Objects holding a stream are returned untouched.
% Uses the globals /holdx (working copy of the object), /notastream,
% /nochange and /ptr (index of the line being examined; read by tryrepair).
%
% Each repair shortens holdx by one line, so the scan is abandoned
% immediately after a repair and restarted by the outer loop with a fresh
% upper bound. Continuing the same for loop would keep the old bound and
% could index past the end of the shortened array.
/fixbrokenarrays {
    /holdx exch store                         % save object

    /notastream true store                    % assume no stream until seen
    holdx {
        (stream) anchorsearch                 % line starts with "stream"?
        {pop pop /notastream false store exit}
        {pop} ifelse
    } forall

    notastream {                              % only if not a stream
        2000 {                                % bounded number of repair passes
            /nochange true store              % set breakout flag
            holdx length 3 ge {               % only if 3 or more lines
                0 1 holdx length 3 sub {      % all lines except the last two
                    /ptr exch store           % save position for tryrepair
                    holdx ptr get
                    ([) search                % look for array start
                    { pop pop                 % keep only text after the [
                      (]) search not          % no array end after it?
                      { pop
                        /nochange false store
                        tryrepair             % merge line ptr+1 into line ptr
                        exit                  % holdx shrank: restart the scan
                      }
                      { pop pop pop } ifelse  % ] found: line is complete
                    }
                    { pop } ifelse            % no [ on this line
                } for
            } if
            nochange {exit} if                % done when a full pass is clean
        } repeat
    } if

    holdx                                     % original or repaired object
} store


% /tryrepair is a service sub for /fixbrokenarrays If a line starts an array but
% does not finish it, the next line is merged with it. Process continues till
% an ending ] occurs. 

% Stack:  -  tryrepair  -
% Service routine for /fixbrokenarrays. Rebuilds /holdx with line ptr and
% line ptr+1 joined into one string; all other lines are kept in order, so
% holdx becomes one element shorter. Requires ptr+1 to be a valid index.
%
% The two lines are joined with a single space between them. The line
% break they replace was token-separating white space in the PDF, and
% joining without it can glue tokens together ("0 R" + "2 0 R" would read
% as the single name "R2").
/tryrepair {
    mark                                      % start new array
    0 1 ptr 1 add {holdx exch get} for        % lines 0 .. ptr+1 onto the stack
    ( ) exch mergestr mergestr                % line[ptr] + space + line[ptr+1]
    ptr 2 add 1 holdx length 1 sub            % remaining lines ptr+2 .. end
        {holdx exch get} for
    ]                                         % complete array
    /holdx exch store                         % and update
} store

%%%%%%%%%%%%%%%%%%%%
%  /getcatobj 
%  Reads /Root to determine the catalog object, then retrieves it.
%  At present, assumes all are revision 0.
%%%%%%%%%%%%%%%%%%%%

   % Stack:  -  getcatobj  -
   % Scans the file from the start for the first line that BEGINS with
   % "/Root", takes the object number and generation that follow it, reports
   % the object number, fetches that object with /getobj and saves its lines
   % in /catobjarray. Also sets /catobjnum and /catversnum.
   % A missing number after /Root executes an undefined name, the same
   % forced-error convention /getobjlist uses, instead of silently
   % corrupting the stack.
   /getcatobj {
         readfile 0 setfileposition             % go to beginning of file
         maxtrips {                             % till finished...
            readfile workstring readline        % get .pdf line
            { (/Root) anchorsearch              % line starts with /Root ?
              {pop exit}{pop} ifelse }          % yes: keep the text after it
            {ran_out_of_document} ifelse        % force error if no root
         } repeat

         token {/catobjnum exch store}          % save object number
               {no_catalog_object} ifelse
         token {/catversnum exch store}         % save generation (not used yet)
               {no_revision} ifelse
         pop                                    % drop the rest of the line ("R")

         (\n/Catalog is object ) print          % report catalog object
         catobjnum ==
         catobjnum getobj
         /catobjarray exch store                % and save
   } store


%%%%%%%%%%%%%%%%%%%%
%  /getpagesobj 
%  Reads /Catalog object to extract the pages object.
%%%%%%%%%%%%%%%%%%%%

% Stack:  -  getpagesobj  -
% Finds the first /Catalog line that begins with "/Pages", takes the object
% number that follows, reports it, fetches that object with /getobj and
% saves its lines in /toppagesarray.
/getpagesobj {
    catobjarray {
        (/Pages) anchorsearch {pop exit} if     % found: keep text after /Pages
        pop                                     % not this line
    } forall

    token pop exch pop                          % first token = object number
    (Top /Pages is object ) print dup ==        % report top /Pages object
    getobj
    /toppagesarray exch store                   % save top pages info
} store


%%%%%%%%%%%%%%%%%%%%
%  /R2obj
%%%%%%%%%%%%%%%%%%%%
%  Converts "R" listing string such as  ( [ 60 0 R 1 0 R 25 0 R]) to
%  array of objects such as [ 60 1 25 ]
%  Enter with R string on stack. Exit with object array on stack.
%%%%%%%%%%%%%%%%%%%%

% Stack:  (string holding [ n g R  n g R ... ])  R2obj  [ n n ... ]
% Tokenizes the string. The "[" token is replaced by a mark and the "]"
% token closes the array, giving [ n g R n g R ... ] in /Ostring. That
% array is checked (is an array, length a multiple of 3, every third
% element is R) and the object numbers alone are returned.
% Failures execute the undefined names rnotanarray, rarraynottriplets or
% rnotinarray. Also sets the global /objj.
/R2obj {
    maxtrips {
        token not {exit} if                     % out of tokens: stop

        dup 100 string cvs ([) eq {pop mark} if % [ token becomes a mark
        dup 100 string cvs (]) eq {pop pop ]} if % ] drops the remaining text
                                                % and closes the array
        dup type (arraytype) eq {exit} if       % array complete: stop
        exch                                    % keep remaining text on top
    } repeat

    dup type (arraytype) ne {rnotanarray} if    % force error if not array
    dup length 3 mod 0 ne {rarraynottriplets} if % force error if no triplets

    /Ostring exch store                         % save the n g R array

    false                                       % becomes true on a missing R
    2 3 Ostring length 1 sub {                  % indices 2, 5, 8, ...
        Ostring exch get (R) ne {pop true} if
    } for
    {rnotinarray} if                            % force error if no R's

    [                                           % start object number array
    0 3 Ostring length 3 sub {                  % indices 0, 3, 6, ...
        dup /objj exch store
        Ostring exch get
    } for
    ]                                           % complete array
} store

%%%%%%%%%%%%%%%%%%%%
%  /getcontentarray
%  Tests each pages object in turn. 
%    If /Type /Pages, goes deeper for more /Kids
%    If /Type /Page, adds objects to content list.
%
%  /pagearray is an array of page content objects. One page can have many objects.
%  /numberedpagearray has one array entry per page. One array can have many objects.
%
%%%%%%%%%%%%%%%%%%%%

% Both lists start empty. /setasstream and /setasarray replace them with a
% new, longer array each time content is added.
/pagearray [] store                              % empty array gets expanded
/numberedpagearray [] store                      % empty array gets expanded

% Stack:  -  getcontentarray  -
% Walks the top /Pages object held in /toppagesarray.
%   - The first line beginning with /Kids is converted to an object number
%     array by /R2obj and passed to /firstkids.
%   - The first line beginning with /Contents is passed, as the remaining
%     text of that line, to /addtocontentarray. This is the same calling
%     form firstkids, secondkids, thirdkids and fourthkids use;
%     addtocontentarray tokenizes the text itself, so it must not be run
%     through R2obj first. /curpage is set beforehand from the first token
%     of the object's first line (assumed to be the "N G obj" header that
%     /getobj reads first) because addtocontentarray reports it.
/getcontentarray {
    toppagesarray {
        (/Kids) anchorsearch                    % /Pages /Kids present?
        {pop R2obj firstkids exit}              % go down one level
        {pop} ifelse
    } forall

    toppagesarray {
        (/Contents) anchorsearch                % /Contents present?
        { pop
          /curpage
              toppagesarray 0 get token pop exch pop   % this object's number
          store
          addtocontentarray exit }              % add to content arrays
        {pop} ifelse
    } forall
} store

% /Firstkids is called by /getcontentarray if there is more than one level
% of /Pages objects. Normally, only SIX /Page objects is permitted.

% Stack:  [ objnum ... ]  firstkids  -
% Saves and reports the array as /firstkidarray, then makes two passes:
%   pass 1: each kid whose object has a line beginning with /Kids is
%           descended into via /R2obj and /secondkids;
%   pass 2: each kid whose object has a line beginning with /Contents has
%           that content added via /addtocontentarray, with /curpage set to
%           the kid's object number.
/firstkids {
    /firstkidarray exch store                   % save
    (   A first kid /Pages array is ) print     % report first nesting
    firstkidarray ==

    firstkidarray {                             % pass 1: nested /Pages
        getobj {
            (/Kids) anchorsearch
            {pop R2obj secondkids exit} if      % go down one level
            pop
        } forall
    } forall

    firstkidarray {                             % pass 2: /Page contents
        /curpage 1 index store
        getobj {
            (/Contents) anchorsearch
            {pop addtocontentarray exit} if     % add to content arrays
            pop
        } forall
    } forall
} store


% /secondkids is called by /firstkids if there is more than two levels
% of /Pages objects. Normally, only SIX /Page objects is permitted.

% Stack:  [ objnum ... ]  secondkids  -
% Same two-pass walk as /firstkids, one level deeper: saves and reports
% /secondkidarray, descends through /Kids lines via /R2obj and /thirdkids,
% then adds /Contents lines via /addtocontentarray with /curpage set.
/secondkids {
    /secondkidarray exch store                  % save
    (      A second kid /Pages array is ) print % report second nesting
    secondkidarray ==

    secondkidarray {                            % pass 1: nested /Pages
        getobj {
            (/Kids) anchorsearch
            {pop R2obj thirdkids exit} if       % go down one level
            pop
        } forall
    } forall

    secondkidarray {                            % pass 2: /Page contents
        /curpage 1 index store
        getobj {
            (/Contents) anchorsearch
            {pop addtocontentarray exit} if     % add to content arrays
            pop
        } forall
    } forall
} store

% /thirdkids is called by /secondkids if there is more than three levels
% of /Pages objects. Normally, only SIX /Page objects is permitted.

% Stack:  [ objnum ... ]  thirdkids  -
% Same two-pass walk as /firstkids and /secondkids, one level deeper:
% saves and reports /thirdkidarray, descends through /Kids lines, then
% adds /Contents lines via /addtocontentarray with /curpage set.
% The text after /Kids is converted to an object number array with /R2obj
% before going down a level, exactly as the two shallower levels do.
% Handing the raw text on would make the next level iterate over the
% characters of the string instead of over object numbers.
/thirdkids {
    (          A third kid /Pages array is ) print   % report third nesting
    dup ==
    /thirdkidarray exch store                   % and save

    thirdkidarray {                             % pass 1: nested /Pages
        getobj {
            (/Kids) anchorsearch                % /Pages /Kids present?
            {pop R2obj fourthkids exit}         % go down one level
            {pop} ifelse
        } forall
    } forall

    thirdkidarray {                             % pass 2: /Page contents
        dup /curpage exch store
        getobj {
            (/Contents) anchorsearch            % /Page /Contents present?
            {pop addtocontentarray exit}        % add to content arrays
            {pop} ifelse
        } forall
    } forall
} store

% /fourthkids is called by /thirdkids if there is more than four levels
% of /Pages objects. This level is assumed to be all /Contents if present.
% Presently restricted to approximately 1296 pages max.

% Stack:  [ objnum ... ]  fourthkids  -      (a /Kids text string is also
%                                             accepted and converted)
% Deepest supported level: saves and reports /fourthkidarray, then for
% each kid adds the first line beginning with /Contents via
% /addtocontentarray, with /curpage set to the kid's object number.
% No further /Kids descent is attempted.
%
% The report string is now actually printed. Previously it was left on
% the stack, so the following store saved the message itself as
% /fourthkidarray and the real argument was stranded underneath.
% A string argument is first run through /R2obj so the kids are iterated
% as object numbers rather than as character codes.
/fourthkids {
    dup type /stringtype eq {R2obj} if          % tolerate raw /Kids text
    (          A fourth kid /Pages array is ) print   % report fourth nesting
    dup ==
    /fourthkidarray exch store                  % and save

    fourthkidarray {                            % for all fourth kids
        dup /curpage exch store
        getobj {
            (/Contents) anchorsearch            % /Page /Contents present?
            {pop addtocontentarray exit}        % add to content arrays
            {pop} ifelse
        } forall
    } forall
} store

%%%%%%%%%%%%%%%%%%%%
%  /addtocontentarray
%%%%%%%%%%%%%%%%%%%%
%  Adds new content to /pagearray and /numberedpagearray
%  /pagearray is a list of content objects as encountered.
%  /numberedpagearray is a list of BY PAGE content arrays.
%%%%%%%%%%%%%%%%%%%%

% Stack:  (text following /Contents)  addtocontentarray  -
% Takes the first token of the text as the content object number
% (/curcon), reports it together with /curpage, and fetches the object.
% If any line of the object contains "/Length" it is treated as a single
% stream and passed to /setasstream. Otherwise line 1 of the object is
% taken to be an array of references, converted with /R2obj and passed to
% /setasarray.
/addtocontentarray {
    token pop exch pop                          % content obj as numeric
    /curcon exch store

    curpage 200 string cvs                      % text forms of both numbers
    curcon 200 string cvs
    (              /Page ) print                % report current /Page object
    exch print
    ( holds contents of ) print
    print
    (.\n) print

    /isstream false store                       % a stream or an object array?
    curcon getobj {
        (/Length) search                        % a stream if found
        {pop pop pop /isstream true store}
        {pop} ifelse
    } forall

    isstream
    {curcon setasstream}                        % add ONE stream to contents
    {curcon getobj 1 get R2obj setasarray}      % add ARRAY of streams instead
    ifelse
} store

% /setasstream adds ONE stream object number to /pagearray or adds a
%  UNIT LENGTH array to /numberedpagearray.

% Stack:  objnum  setasstream  -
% Appends objnum to /pagearray and the one-element array [objnum] to
% /numberedpagearray. Both globals are replaced by new, longer arrays.
% Uses the global /holdx as scratch.
/setasstream {
    /holdx exch store                           % save value

    /pagearray
        [ pagearray {} forall holdx ]           % old entries + value
    store

    /numberedpagearray
        [ numberedpagearray {} forall
          holdx 1 array astore ]                % old entries + [value]
    store
} store

% /setasarray adds ONE stream object array to /pagearray or adds a
%  SEVERAL STREAM LENGTH array to /numberedpagearray.

% Stack:  [ objnum objnum ... ]  setasarray  -
% Appends every element of the array to /pagearray and the array itself,
% as one entry, to /numberedpagearray. Both globals are replaced by new,
% longer arrays. Uses the global /holdx as scratch.
/setasarray {
    /holdx exch store                           % save [234 345 347] array

    /pagearray
        [ pagearray {} forall holdx {} forall ] % add as 234 345 347
    store

    /numberedpagearray
        [ numberedpagearray {} forall holdx ]   % add as [234 345 347]
    store
} store

%%%%%%%%%%%%%%%%%%%%
%  /reportPDFstructure
%%%%%%%%%%%%%%%%%%%%
%  High level code that finds key objects and reports on them, ending
%  with /pagearray and /numberedpagearray page content lists.
%%%%%%%%%%%%%%%%%%%%

% Stack:  -  reportPDFstructure  -
% Top level driver: prints a title naming the source file, builds the xref
% list, locates the catalog and the top /Pages object, walks the page tree
% and finally prints /pagearray and /numberedpagearray.
/reportPDFstructure {
    diskfilesourcefilename                      % title shows source PDF file
    (\n\nPDF page content extraction for file ) print
    print
    (...\n\n) print

    getobjlist                                  % get list of objects
    getcatobj                                   % find the catalog object
    getpagesobj                                 % find the top pages object
    getcontentarray                             % find the page contents

    (\n\n\n\n) print
    (Content objects are...\n ) print           % report contents arrays
    pagearray ==
    (\n) print flush
    numberedpagearray ==
    (\n\n\n\n) print
} store


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Utilities that follow are specific to word frequency extraction and analysis.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%
%  /expandline
%%%%%%%%%%%%%%%%%%%%
%  extracts string objects from page content. Then uses subroutines
%  to force lower case and then get individual words from the string.
%%%%%%%%%%%%%%%%%%%%

% Stack:  (content line)  expandline  -
% Tokenizes the line. Every string token is lower-cased and split into
% words (/forcelowercase, /getwordsfromstring); all other tokens are
% discarded.
% The loop runs until the line is exhausted. Each successful token call
% consumes part of the string, so it always terminates. A fixed iteration
% cap would stop early on a long line, dropping its remaining words and
% leaving the unparsed remainder on the operand stack.
/expandline {
    {
        token not {exit} if                     % no more tokens: done
        dup type /stringtype eq
        { forcelowercase                        % force lower case
          getwordsfromstring }                  % extract individual words
        { pop } ifelse                          % pop nonstrings
    } loop
} store


%%%%%%%%%%%%%%%%%%%%
%  /forcelowercase
%%%%%%%%%%%%%%%%%%%%
%  Converts all upper case characters in a string of words to lower case
%  and appends one trailing space.
%%%%%%%%%%%%%%%%%%%%

% Stack:  (string)  forcelowercase  (lowercased string + one space)
% Character codes 65..90 ("A".."Z") have 32 added; every other code is
% copied as is. A single space (32) is appended so the final word is
% space-terminated. The result is a new string built by /makestring.
/forcelowercase {
    [ exch                                      % start a character array
      {
          dup 65 lt 1 index 90 gt or            % outside "A".."Z" ?
          not {32 add} if                       % inside: shift to lower case
      } forall
      32                                        % add trailing space
    ] makestring                                % convert array to string
} store

%%%%%%%%%%%%%%%%%%%%
%  /getwordsfromstring
%%%%%%%%%%%%%%%%%%%%
%  Extracts raw words from a string by searching on a space delimiter.
%%%%%%%%%%%%%%%%%%%%

% Stack:  (string)  getwordsfromstring  -
% Splits the string at each space and hands every piece that precedes a
% space to /procword. Text after the last space is discarded. Stops after
% at most 30000 pieces.
/getwordsfromstring {
    30000 {
        ( ) search not {pop exit} if            % no space left: drop the rest
        exch pop                                % discard the matched space
        procword                                % process the raw word
    } repeat
} store

%%%%%%%%%%%%%%%%%%%%
%  /removetrailingpunct
%%%%%%%%%%%%%%%%%%%%
%  Attempts to remove any trailing punctuation from a word string.
%%%%%%%%%%%%%%%%%%%%

% removal of elipses or true closing quotes are apparently tricky because of
% the encoding vector. WinANSIEncoding is assumed here. 133 is ellipses, 
% 147 left double quote, 148 right double quote. Hyphens are currently left 
% as is and not further processed yet. Slashed parenthesis are also not yet 
% dealt with and left as is.

% Stack:  (word)  removetrailingpunct  (word, possibly one char shorter)
% Only words of four or more characters are examined. If the last
% character is one of the codes listed below, a substring without that
% character is returned; otherwise the word is returned unchanged.
% Sets the global /curlastchar.
%
% Codes tested (hex string below, in this order):
%   46 period   44 comma    58 colon    59 semicolon   32 space
%   34 crude quote   148 true closing quote   133 ellipsis
%   63 question mark
/removetrailingpunct {
    dup length 4 ge {                           % only for four chars or more
        dup dup length 1 sub get                % grab the last char
        /curlastchar exch store

        false
        <2E2C3A3B20229485 3F>
        {curlastchar eq or} forall              % true if any code matches

        {0 1 index length 1 sub getinterval} if % shorten by one char
    } if
} store

%%%%%%%%%%%%%%%%%%%%
%  /removeleadingpunct
%%%%%%%%%%%%%%%%%%%%
%  Attempts to remove any leading punctuation from a word string.
%  Currently only removes crude quotes or true opening quotes.
%%%%%%%%%%%%%%%%%%%%

% Stack:  (word)  removeleadingpunct  (word, possibly one char shorter)
% Only words of four or more characters are examined. If the first
% character is code 34 (crude quote) or 147 (true opening quote), a
% substring without that character is returned; otherwise the word is
% returned unchanged. Sets the global /curfirstchar.
/removeleadingpunct {
    dup length 4 ge {                           % only for four chars or more
        dup 0 get                               % grab the first character
        /curfirstchar exch store

        false
        <2293>                                  % codes 34 and 147
        {curfirstchar eq or} forall

        {1 1 index length 1 sub getinterval} if % drop the first char
    } if
} store


%%%%%%%%%%%%%%%%%%%%
%  /procword
%%%%%%%%%%%%%%%%%%%%
%  Higher level service routine cleans up words, sends results to /gotone.
%%%%%%%%%%%%%%%%%%%%

% Stack:  (raw word)  procword  -
% Strips up to three trailing punctuation characters and one leading
% quote. A result longer than three characters is echoed with == and
% passed to /gotone; shorter results are discarded.
/procword {
    removetrailingpunct                         % remove trailing punctuation,
    removetrailingpunct                         % up to three characters
    removetrailingpunct
    removeleadingpunct                          % remove leading punctuation

    dup length 3 le
    {pop}                                       % too short: ignore
    {dup == gotone} ifelse                      % report and count the word
} store


%%%%%%%%%%%%%%%%%%%%
%  /gotone
%%%%%%%%%%%%%%%%%%%%
%  Places word in worddict. If a new word, creates a new name entry with a 
%  count of 1. If a previously found word, bumps the count.
%  Requires previously defined worddict.
%%%%%%%%%%%%%%%%%%%%

% Stack:  (word)  gotone  -
% Converts the word to a name (/curword) and counts it in worddict: a name
% already present has its count incremented, a new name is entered with a
% count of 1.
% New entries are written with "put" rather than "worddict begin ... def
% end". Inside such a begin, the lookup of curword itself would be
% resolved in worddict first, so a document containing the word "curword"
% would shadow the variable and misfile later new words.
/gotone {
    cvn /curword exch store                     % convert word to name
    worddict curword known                      % is this name known?
    { worddict curword 2 copy get 1 add put }   %    yes - bump count
    { worddict curword 1 put }                  %     no - enter new
    ifelse
} store

%%%%%%%%%%%%%%%%%%%%
%  /worddict
%%%%%%%%%%%%%%%%%%%%
%  Defines dictionary used to store /wordname -> count word frequency entries.
%%%%%%%%%%%%%%%%%%%%

% Keys are word names, values are integer counts (see /gotone).
% 200 is the initial capacity requested from dict.
/worddict 200 dict store

%%%%%%%%%%%%%%%%%%%%
%  /reportworddict
%%%%%%%%%%%%%%%%%%%%
%  Reads the word dictionary, sorts by frequency, then reports frequency to log file.
%  Only words used at least /minusecount times are currently reported.
%%%%%%%%%%%%%%%%%%%%

% /reportworddict prints a word when its count is greater than or equal
% to this value.
/minusecount 1 store                                     % min frequency to report

% Stack:  -  reportworddict  -
% Prints a header, converts worddict into an array of [ (word) count ]
% entries, sorts it with /popbubblesort2 (largest count first) and prints
% every entry whose count is at least /minusecount via /formattedprint.
% Sets the globals /cn (current key) and /cv (current count).
% The text buffer for a name key is sized from the name itself, so words
% longer than 100 characters no longer raise rangecheck in cvs. Keys that
% are not names fall back to the 100 character buffer.
/reportworddict {
    (\nFiltered word frequency in document: \n\n)   % pretty print header
    print flush

    mark
    worddict {                                  % for each worddict entry
        exch /cn exch store                     % save name
        /cv exch store                          % save frequency
        mark
            cn
            cn type /nametype eq
            {cn length} {100} ifelse            % buffer big enough for key
            string cvs
            cv
        ]
    } forall
    ]
    popbubblesort2                              % sort results by frequency

    {
        dup 1 get minusecount ge                % used often enough?
        {formattedprint}                        % print to log
        {pop} ifelse
    } forall
} store

%%%%%%%%%%%%%%%%%%%%
%  /formattedprint
%%%%%%%%%%%%%%%%%%%%
%  Arranges word frequency results to be sent to log file.
%%%%%%%%%%%%%%%%%%%%

% Stack:  [ (word) count ]  formattedprint  -
% Writes a newline followed by "word - count" to standard output and
% flushes. Sets the globals /wordx and /num.
/formattedprint {
    dup 0 get /wordx exch store                 % grab word
    1 get /num exch store                       % grab count

    num 10 string cvs                           % count as text
    (\n) print
    wordx print
    ( - ) print
    print                                       % the count text
    flush
} store

%%%%%%%%%%%%%%%%%%%%
%  /pagecounter
%%%%%%%%%%%%%%%%%%%%
%  A convenience variable used to report words by page to log file
%%%%%%%%%%%%%%%%%%%%

% Incremented by /analyzewordfrequency once per entry of /pagearray, after
% the header for that entry has been printed, so the first header shows 0.
/pagecounter 0 store


%%%%%%%%%%%%%%%%%%%%
%  /analyzewordfrequency
%%%%%%%%%%%%%%%%%%%%
%  A higher level convenience operator reads /Page /Content lines, extracts
%  strings, extracts words, filters words, and adds them to /worddict.
%%%%%%%%%%%%%%%%%%%%

% Stack:  -  analyzewordfrequency  -
% For every object number in /pagearray: fetches the object, prints a
% header carrying the current /pagecounter, bumps the counter, runs
% /expandline over every line of the object, then prints blank lines.
/analyzewordfrequency {
    pagearray {
        getobj                                  % lines of this content object

        pagecounter 20 string cvs               % counter as text
        (Filtered words in page ) print         % pretty print header
        print
        (:\n\n) print

        /pagecounter pagecounter 1 add store    % bump page counter

        {expandline} forall                     % read lines
        (\n\n\n) print                          % add spaces between pages
    } forall
} store

%%%%%%%%%%%% demo - remove or alter before reuse %%%%%%%%%%%%%

% These statements run as soon as the file is executed. They operate on
% the file opened above as /readfile and write all results to standard
% output.
reportPDFstructure              % find the .PDF structure and make a page array
analyzewordfrequency            % analyze the word frequency
reportworddict                  % report word frequency results

(\n\n\n\n) print flush          % pretty print end notes



% EOF