%!

%   ACROBAT CATALOG DATA MANIPULATION TOOLS
%   =======================================
%   by  Don Lancaster  v1.4  April 8, 1997

%   Copyright c. 1997 by Don Lancaster and Synergetics, Box 809,
%   Thatcher AZ, 85552 (520) 428-4073. synergetics@tinaja.com
%   All commercial rights and all electronic media rights *fully*
%   reserved. Linking welcome. Reposting is expressly forbidden.

%   Further support on http://www.tinaja.com
%   Consulting services available via don@tinaja.com

%   ====================================

%   WARNING: Preliminary and partial code. Use at your own risk.
%            Report all problems to don@tinaja.com
%            *NOT* an official Verity or Adobe document.
%            Only warranty is "approximate quantity one"

%   ====================================

%  These PostScript-as-language routines let you extract relevant data
%  from an Acrobat cataloging of a folder of documents. These utilities
%  let you do such things as find word frequencies, provide author
%  improvement tools, or create a hard copy index.

%  Background tutorial information appears in ACATDATA.PS and 
%  DISTLANG.PS on http://www.tinaja.com/acrob01.html Also see
%  CATWORDS.PS as a simpler word list extraction demo.

%  A specific example of catalog word frequency extraction is included.

%  The code only has been tested on version vdk103.dll and Style 
%  3.0  ../Style/style.did  Acrobat Catalog. For "medium" sized
%  catalogs. Circa spring of 1997

%  First, read these files as textfiles in an editor or word processor.
%  Next, alter the filenames to meet your needs.  Then collect the
%  utilities together and extend them to perform your needed task.
%  Finally, send the file to Acrobat Distiller or GhostScript.

%  Note that a "NO PDF FILE PRODUCED" error is normal and expected.


%%%%%%%%%%%%%%%%  (A) DEFINE CAT UTILITY FILENAMES %%%%%%%%%%%%%%%%%%


% ALWAYS USE "\\" WHEN YOU MEAN "\" IN THE FILENAME STRINGS!!!!!

% Put the exact full .DID filename you wish to access in the /didfilename
% string. In the case of Acrobat Catalog, you will usually find this file
% in the INDEX\PARTS\folder of the cataloged documents. The highest 
% number files are the latest...

/definefilenames {                                   % begin proc

/didfilename 
(c:\\Windows\\Desktop\\ACROBA~1\\INDEX\\PARTS\\00000000.DID) 
def % data file source

% Put the exact full .DDD filename you wish to access in the /dddfilename
% string...

/dddfilename 
(c:\\Windows\\Desktop\\ACROBA~1\\INDEX\\PARTS\\00000000.DDD) 
def % data file source


% Several intermediate data files are generated to simplify access.
% These files should be placed in a work folder of your choice. Note
% that if a previous file having the same name exists in the same
% folder, that it will be deleted. This starts a new file, rather than
% appending. Note also that these files are LEFT IN PLACE after program
% completion.

% Put the full name of a work file ending in catwords.txt here. This
% file will hold the keyword list. 

/wordsfilename 
(c:\\Windows\\Desktop\\Nucat\\catwords.txt) 
def % data file source

% Put the full name of a work file ending in catmaps.txt here. This
% file will hold the mapping of keywords to documents.

/mapsfilename 
(c:\\Windows\\Desktop\\Nucat\\catmaps.txt) 
def % data file source

% Put the full name of a work file ending in catlinks.txt here. This
% file holds the links between keywords and their mappings.

/linksfilename 
(c:\\Windows\\Desktop\\Nucat\\catlinks.txt) 
def 

% Put the full name of a work file ending in catdocs.txt here. This 
% file holds the document names and related information...

/docsfilename 
(c:\\Windows\\Desktop\\Nucat\\catdocs.txt) 
def 

% Put the full name of a work file ending in catpage.txt here. This file
% reates document page numbers to word position counts...

/pagesfilename 
(c:\\Windows\\Desktop\\Nucat\\catpage.txt) 
def 


% Put the full name of your first output file here. This file
% can have any name you want...

/out1filename 
(c:\\Windows\\Desktop\\Nucat\\mainout.txt) 
def 

% Put the full name of an optional second output file here. This file
% can have any name you want...

/out2filename 
(c:\\Windows\\Desktop\\Nucat\\auxout.txt) 
def 

      } def                           % end definefilenames proc


%%%%%%%%%%%%%% (B)  CREATE FILE OBJECTS %%%%%%%%%%%%%%%%%%%%%%%



% /makefileobjects creates file objects for access. It also erases
% any previous work files having exactly similar full filenames...

/makefileobjects {                        % begin makefileobjects proc

% create input file objects...

dddfilename (r) file /dddf exch store     % make ddd read file object
didfilename (r) file /didf exch store     % make did read file object



% create the new work and output file objects...

    % work file objects...

         wordsfilename (w+) file          % words file object
         /words exch store 

         mapsfilename (w+) file           % maps file object
         /maps exch store

         linksfilename (w+) file          % links file object
         /links exch store 

         docsfilename (w+) file           % docs file object
         /docs exch store

         pagesfilename (w+) file          % pages file object
         /pages exch store
 
   % output file objects...

         out1filename (w+) file           % first output file object
         /out1 exch store

         out2filename (w+) file           % aux output file object
         /out2 exch store

   } def                                  % complete

% /closeallfiles is used for end cleanup. Must match above!

/closeallfiles { [didf dddf words maps links docs pages out1 out2 ]
                 {closefile} forall} def


%%%%%%%%%%%%%%% (C) READ DID AND DDD DIRECRTORIES %%%%%%%%%%%%%%%

% /readddd&did extracts the directory blocks from the Catalog .DDD
% and .DID files.

/readddd&did {  dddf 1024 string readstring    % grab ddd directory 
                {/ddddir exch store}           % and save
                {Can't_read_ddd_directory}     % error trap
                ifelse

                didf 1024 string readstring    % grab did directory
                {/diddir exch store}           % and save
                {Can't_read_did_directory}     % error trap
                ifelse
                 } def


 %%%%%%%%%%%%%% (D) MAP AND EXTRACT KEY DATA FILES %%%%%%%%%%%%%%%

% Useful portions of the DID and DDD files are extracted into individual
% data files containing words, maps, links, docinfo, and pages. Saving
% to new files adds flexibility and eliminates reading problems on 
% sector boundaries. 

% /listsectors reads directories to extract sequential sector blocks
% for a given data file. Requires indexname and (XXX) subfile name and
% did or ddd filename onstack; exits with array of sector start addresses.

% The output format is an array [[segn segnlen][seg0 seg1 ... segn-1]]              

/listsectors {/dfs exch store                   % hold data file source
               search { pop pop                 % extract page bytes


               dup 1 4 getinterval              % read filelength
               {} forall
               16777216 mul exch
               65536 mul add exch
                 256 mul add add
                /lenhold exch store

                  9 58 getinterval              % remember the div!
                 /wdir0 exch store              % temp save

                 mark mark                      % start data arrays
                
                 wdir0 0 get                    % get base address
                 wdir0 1 get 256 mul add 
                 1024 mul                 
                 dup 0 eq                       % error check
                   {no_word_page_map_start} if 

                 1 1 14 {2 mul dup              % read 14 more locs         
                  wdir0 exch get exch           % get file posn value
                  wdir0 exch 1 add get
                  256 mul add 1024 mul
                  dup 0 eq {exit}if             % flag if continue
                   } for

                 dup 0 gt {getmore}{pop}ifelse  % another page of locs?        
                 }
                {unable_to_find_xxx_directory}  % error trapper
                  ifelse

                /lastseg exch store  ]          % save last seg; array rest
                dup length                      % number of 1024 segs
             
                mark  exch lastseg exch         % start lastseg info array
                1024 mul lenhold exch sub       % length of last seg

                dup dup 1024 ge exch 0 le or    % error trap
                {filename_length_read_error} if

                 ]  exch ]                      % complete array pair

                } def


% /getmoresegs grabs sequential sectors of 512 addresses...

/getmore {15 1 22 {dfs fileposition             % save fileposition
                   /oldposn exch store

                   2 mul dup wdir0 exch get     % set sector address
                   exch 1 add wdir0 exch get
                   256 mul add 1024 mul
                   dfs exch setfileposition

                   dfs 1024 string readstring   % read page of locs

                   {/pdirhold exch store
                   0 1 511 {2 mul dup  pdirhold exch get
                            exch 1 add pdirhold exch get
                            256 mul add 1024 mul 
                 
                            dup 0 eq {exit} if       % till zero address
                    } for}
                   {could_not_read_directory} ifelse % error trap

                   dup 0 eq {pop exit} if            % no more segs
           }for }def

% /findallsectors finds the base address for each sector in every needed
% reference file...

/findallsectors {                               % begin proc
               diddir (diw) didf                % find word sectors
               listsectors                      % pile on stack  
               /wordsegs exch store             % and save

                diddir (div) didf                % find list sectors
                listsectors                      % pile on stack         
                /mapsegs exch store              % and save

               diddir (dif)  didf               % find link sectors
               listsectors                      % pile on stack        
               /linksegs exch store             % and save

               ddddir (xya)  dddf               % find doc sectors
               listsectors                      % pile on stack         
               /docsegs exch store              % and save

               ddddir (xyb)  dddf   %rong       % find name sectors
               listsectors                      % pile on stack        
               /pagesegs exch store             % and save

               } def

% /extractfiles separates the words, maps, links, docs, and names into
% separate and standalone files. It reads the sector files for
% sequential sectors, then adjusts the last sector for file length.

% /extractfiles generates isolated files for words, maps, links, 
% docs, and names. These will be only somewhat less long than the
% original .DDD and .DID files. But all data is isolated and free
% from segment boundary hassles.  The full segments are first written
% in a forall loop, followed by the partial final segment...

/extractfiles {

     wordsegs 1 get {didf exch setfileposition      % set base address
                     didf 1024 string readstring    % grab 24 chars
                    {words exch writestring}        % write to output
                    {Can't_map_wordfile_to_disk}    % error trap
                    ifelse} forall

     wordsegs 0 get aload pop exch                  % last base and length
                    didf exch setfileposition       % set base
                    didf exch string readstring     % read length
                    {words exch writestring}        % error trap    
                    {can't_map_last_wordfile_seg}   % write to output
                    ifelse


      mapsegs 1 get {didf exch setfileposition      % for maps file
                     didf 1024 string readstring    % as above
                    {maps exch writestring}
                    {Can't_map_wordfile_to_disk}
                    ifelse} forall

      mapsegs 0 get aload pop exch 
                    didf exch setfileposition
                    didf exch string readstring
                    {maps exch writestring}
                    {can't_map_last_wordfile_seg}
                    ifelse

     linksegs 1 get {didf exch setfileposition       % for links file    
                     didf 1024 string readstring     % as above
                    {links exch writestring}
                    {Can't_map_wordfile_to_disk}
                    ifelse} forall

     linksegs 0 get aload pop exch                   % for segs file
                    didf exch setfileposition        % as above
                    didf exch string readstring
                    {links exch writestring}
                    {can't_map_last_wordfile_seg}
                    ifelse

      docsegs 1 get {didf exch setfileposition       % for docs file
                     didf 1024 string readstring     % as above
                    {docs exch writestring}
                    {Can't_map_wordfile_to_disk}
                    ifelse} forall

      docsegs 0 get aload pop exch 
                    didf exch setfileposition
                    didf exch string readstring
                    {docs exch writestring}
                    {can't_map_last_wordfile_seg}
                    ifelse

     pagesegs 1 get {didf exch setfileposition       % for pages file
                     didf 1024 string readstring     % as above
                    {pages exch writestring}
                    {Can't_map_wordfile_to_disk}
                    ifelse} forall

     pagesegs 0 get aload pop exch 
                    didf exch setfileposition
                    didf exch string readstring
                    {pages exch writestring}
                    {can't_map_last_wordfile_seg}
                    ifelse
} def

%%%%%%%%% (E) EXTRACT WORD OCCURRENCE INFO %%%%%%%%%%%%%%%%%

% /getworddata accepts a sequential word number on the stack. It 
% looks in the links file, then reads the word file to place the
% word on the stack. It then reads the maps file to place the raw
% mapping info on the stack as a string.

/getworddata { links exch 17 mul setfileposition   % find 17 byte record
               links 17 string readstring          % and read it
               {/curlink exch store}               % save as variable
               {could_not_read_link_file} ifelse   % error trap
               
               curlink 2 get 65536 mul             % calculate word start
               curlink 1 get 256 mul add
               curlink 0 get add
               words exch setfileposition

               curlink 3 get 1 sub                 % word length - null
               words exch string readstring        % read word
               {  }                                % leave on stack 
               {unable_to_read_words_file} ifelse  % error trap

               curlink 7 get 16777216 mul          % calculate map start
               curlink 6 get 65536 mul add
               curlink 5 get 256 mul add
               curlink 4 get add
               maps exch setfileposition

               curlink 10 get 65536 mul            % calculate map length
               curlink 9 get 256 mul add
               curlink 8 get add
               maps exch string readstring         % get map info
               { }                                 % leave map str on stack
               {could_not_read_maps_file}ifelse    % error trap


  } def

% /expandmapdata converts the stack top raw map data string
% into a nested array form of...

%      [[ firstfilenum  firstwordinfile secwordinfile ... ]
%       [ secondfilenum firstwordinfile secwordinfile ... ]
%       [ lastfilenum   firstwordinfile secwordinfile ... ]
%      ]    % all values remain RELATIVE to previous.

% Note that this takes many more bytes per record than the original...

/expandmapdata {dup length ()
                 /SubFileDecode filter /mh exch store  % map to file object
                mark mark                % start full and first array
                grabmcvalue              % get initial doc number- 0 ok
                not {map_read_error}if

                {grabmcvalue              
                  not {] 
                  dup length 0 eq {pop} if  % clip null data
                  ] exit}                 % finish array on data end
                {dup 0 eq {pop ][} if     % enter number or close
                  } ifelse                % value or close

                } loop                    % keep going till end of data

} def

% /grabmcvalue is a mod 128 value extractor. If under 128 return as is.
% If > 128 subtract 128 and get next value for up to three values...
% Returned stack format is -value- -true- if readable and -false- if not.


/grabmcvalue { mh read 
                {                         % do only if true=data
               dup 128 lt {true}          % exit with single byte value

               {128 sub                   % find real lsb value
                mh read not               % read second data byte

                {map_format_error}if      % could not read second data byte
 
               dup 128 lt                 % test for three byte read
                  {128 mul add true}      % exit with dual byte value

                  {128 sub                % find real second byte value
                   mh read not            % read third data byte
                   {map_format_error} if  % could not read third data byte

               dup 128 lt                 % evaluate third byte
                   {16384 mul exch
                    128 mul add add true} % output third byte
                   {map_data_value_too_big}
                   ifelse                 % if value too high


   } ifelse                               % three byte value
   } ifelse                               % two byte value
   }
   {false} ifelse                        % valid vs end data
              } def



%%%%%%%%% (F) COUNT TOTAL WORD OCCURRANCES %%%%%%%%%%%%%%%%%

% /findtotalwords divides the map file count by 17 to get the
% total unique words in the index...

/findtotalwords {/totalwords links fileposition 
                 17 div floor cvi def} def

% /countworduse converts the stack top expandmapdata array of form...

%      [[ firstfilenum  firstwordinfile secwordinfile ... ]
%       [ secondfilenum firstwordinfile secwordinfile ... ]
%       [ lastfilenum   firstwordinfile secwordinfile ... ]
%      ]

%     ...into a single total word count numeral

/countworduse { 0 exch  {length 1 sub add} forall} def

% mergestr merges the two top stack strings into one top stack string

/mergestr {2 copy length exch length add string dup dup 4 3 roll
4 index length exch putinterval 3 1 roll exch 0 exch putinterval} def


% /counttoascii takes an array of form [(wordname) wordcount] and 
% converts it to a single ascii text line of  wordname -sp- wordcount -cr-

/counttoascii { dup 0 get( ) mergestr exch 1 get 6 string cvs 
                mergestr (\n) mergestr} def


%%%%%%%%%%%%%%%%%%%%%%%% end utilities %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%% EXAMPLE I: REPORT CATALOG WORD FREQUENCY OF USE %%%%%%%%%%%%%%

% This example routine uses many of the above tools to read internal
% Acrobat catalog files and generate a report of the word frequencies
% of usage. This is very useful for restricting keywords, as an 
% independent spelling checker, and as an author quality improvement tool.

% Send this code to Acrobat Distiller or Ghostscript.
% Results are ascii text written in alphabetical order to out1filename.
% This file may then be viewed and modified with any editor or wp.


/reportover 10 def                   % default low count limit

/catfreqs { definefilenames          % enter filenames above or replace
            makefileobjects          % set up system files
            readddd&did              % read internal catalog files
            findallsectors           % calculate internal base addresses
            extractfiles             % extract data files
            findtotalwords           % find total unique words in catalog


      out1                                           % write header
     (\nSynergetics word frequency extractor for Acrobat Catalog.\n)
     writestring
     out1
     (=========================================================\n\n)
     writestring 
     out1
     (Copyright c 1997 by Don Lancaster & Synergetics, Box 809,\n)
     writestring
     out1

     (Thatcher, AZ 85552. (520) 428-4073. don@tinaja.com Support and\n)
     writestring
     out1
     (full consulting services at http://www.tinaja.com/acrob01.html\n\n)
     writestring

     out1 
     (All commercial and electronic media rights *FULLY* reserved.\n)
     writestring
     out1
     (This is NOT an official Adobe or Verity product.\n)
     writestring
     out1
     (Experimental code -- Use at your own risk\n)
     writestring
     out1                        
     (\n\nWord frequency usage of )writestring
     out1
     didfilename 100 string cvs writestring
     out1
     (\n\nTotal unique words in catalog: ) writestring
     out1
     totalwords 7 string cvs writestring 
     out1
     (.\n) writestring
     out1
     (Numbers and words used more than ) writestring
     out1
     reportover 6 string cvs writestring
     out1
     ( times in catalog:\n\n) writestring


   0 1 totalwords 1 sub {                 % begin processing loop

            dup 1000 mod 0 eq             % optional progress reporter
            {(.) print flush} if

                         getworddata      % get word and mapping
                         mark 3 1 roll    % start word-count array
                         expandmapdata    % make list of words in docs
                         countworduse     % convert to number
                         ]                % complete word-count array
 
            dup 1 get reportover ge {     % write only high use words
            counttoascii                  % array to string
            out1 exch writestring}        % string to file
              {pop} ifelse                % if low use word

                        } for            % for all words
        
       out1                              % trailer
       (\n    <end of list>\n\n) 
       writestring 

       } def   

% Make sure filenames are correct above.
% Allow three minutes for 30,000 unique words in 125 six page docs.
% This actually does it...

/reportover 1000 def                    % set lower word count limit

catfreqs                                % generate word count ascii file
closeallfiles                           % close active files


% A typical out1 textfile should look like this...

%%
%% Synergetics word frequency extractor for Acrobat Catalog.
%% =========================================================
%%
%% Copyright c 1997 by .... 
%%
%%        ( more header stuff)
%%
%% Word frequency usage of
%% c:\Windows\Desktop\ACROBA~1\INDEX\PARTS\00000000.DID 

%%
%% Total unique words in catalog: 27787.
%% Numbers and words used more than 1000 times in catalog:

%% 520 1003
%% 800 1641
%% A 1414
%% a 7825
%% all 2250
%% an 1278
%% and 8914
%%
%%    (more data here)
%%
%% up 1066
%% with 1277
%% www 1293
%% you 3839
%% your 2303
%%
%%    <end of list>

%%%%%%%%%%%%%%%%%%%%%%%% end examples %%%%%%%%%%%%%%%%%%%%%%%%%%%


%   ====================================
%   Copyright c. 1997 by Don Lancaster and Synergetics, Box 809,
%   Thatcher AZ, 85552 (520) 428-4073. synergetics@tinaja.com
%   All commercial rights and all electronic media rights *fully*
%   reserved. Linking welcome. Reposting is expressly forbidden.

%   Further support on http://www.tinaja.com
%   Additional utilities available on a custom basis.
%   Consulting services available via don@tinaja.com

%   ====================================