%!

%   ACROBAT CATALOG WORD LIST EXTRACTOR
%   ===================================
%   by  Don Lancaster  v1.4  April 8, 1997

%   Copyright c. 1997 by Don Lancaster and Synergetics, Box 809,
%   Thatcher AZ, 85552 (520) 428-4073. synergetics@tinaja.com
%   All commercial rights and all electronic media rights *fully*
%   reserved. Linking welcome. Reposting is expressly forbidden.

%   Further support on http://www.tinaja.com
%   Consulting services available via don@tinaja.com

%   ====================================

%   WARNING: Preliminary and partial code. Use at your own risk.
%            Report all problems to don@tinaja.com
%            *NOT* an official Verity or Adobe document.
%            Only warranty is "approximate quantity one"

%   ====================================

%  This PostScript-as-language routine extracts the word list from
%  an Acrobat Catalog and returns it to a textfile of your choosing.

%  This reads the -entire- included word list, including words that
%  may have been dropped on catalog revisions or updates.

%  Background tutorial information appears in ACATDATA.PS and 
%  DISTLANG.PS on http://www.tinaja.com/acrob01.html

%  The code only has been tested on version vdk103.dll and Style 
%  3.0  ../Style/style.did  Acrobat Catalog. For "medium" sized
%  catalogs. Circa spring of 1997

%  First, read this file as a textfile in an editor or word processor.
%  Next, alter the filenames to meet your needs. Finally, send the
%  file to Acrobat Distiller or GhostScript to generate your word list.

%  Note that a "NO PDF FILE PRODUCED" error is normal and expected.


%%%%%%%%%%%%%%%%  (A) CATALOG KEYWORD EXTRACTOR %%%%%%%%%%%%%%%%%%

(\n\nBeginning keyword file read) print flush    % progress note, flushed right away (speed monitor)

% ALWAYS USE "\\" WHEN YOU MEAN "\" IN THE FILENAME STRINGS!!!!!

% Place the exact full .DID filename you wish to expand in the /grabfilename
% string. In the case of Acrobat Catalog, you will usually find this file
% in the INDEX\PARTS\ folder of the cataloged documents. The
% highest-numbered files are the latest.

% Data file source: full path of the .DID file that is to be expanded.
/grabfilename (c:\\Windows\\Desktop\\ACROBA~1\\INDEX\\PARTS\\00000000.DID) def

% Place the exact full filename of where you wish to put the data
% file examination results in the /dumpfilename string.

% WARNING: If a file with this name already exists, it **WILL** be deleted!

% Data file destination: full path of the file that receives the raw words.
/dumpfilename (c:\\Windows\\Desktop\\Nucat\\rawwords) def


% ERASE any existing write file, to allow creation of a new one rather
% than appending.
%
% `status` applied to a filename string returns
%      pages bytes referenced created true     when the file exists
%      false                                   when it does not.
% The operand has to be the VALUE of dumpfilename (the real path). The
% literal string (dumpfilename) would test, and delete, a file that is
% literally called "dumpfilename" instead of the intended output file.

dumpfilename status
  { pop pop pop pop                 % discard the four status values
    dumpfilename deletefile }       % remove the stale output file
if

/source grabfilename (r) file store       % source file object, opened for reading
/sink   dumpfilename (w) file store       % sink file object, opened for writing


% /grabindex reads the main directory and saves it...


% grabindex:  -  grabindex  -
% Reads the first 1000 bytes of source and stores them as /dirindex.
% The file position that was current on entry is put back afterwards.
% A short read executes the undefined name could_not_read_directory,
% which stops the job with an error.

/grabindex {
    /oldposn source fileposition store      % remember where we were
    source 0 setfileposition                % rewind to the start
    source 1000 string readstring           % -> substring bool
        { /dirindex exch store }            % full read: keep the directory
        { could_not_read_directory }        % short read: error trap
    ifelse
    source oldposn setfileposition          % return to the saved position
} def

% /getmore grabs sequential pages of 512 addresses...

% getmore:  -  getmore  addr ... addr
% For entries 15 through 22 of wdir0, takes the two-byte little-endian
% value times 1024 as a file position, reads a 1024 byte page from source
% there into /pdirhold, and decodes entries 0 through 255 of that page
% the same way. Every decoded value is left on the operand stack.
% A decoded value of zero ends the inner loop; that zero is then popped
% and the outer loop ends as well. A short page read executes the
% undefined name could_not_read_directory_page as an error trap.
%
% Note: the position stored in /oldposn is not read back in this routine.
% NOTE(review): the describing comment speaks of 512 addresses per page,
% but only the first 256 two-byte entries are decoded here -- confirm.

/getmore {
    15 1 22 {                                   % stack: ... n
        /oldposn source fileposition store      % save fileposition

        2 mul dup 1 add                         % ... 2n 2n+1
        wdir0 exch get 256 mul                  % ... 2n high*256
        exch wdir0 exch get add                 % ... high*256+low
        1024 mul                                % ... page address
        source exch setfileposition

        source 1024 string readstring           % read page of addresses
        {
            /pdirhold exch store
            0 1 255 {                           % stack: ... k
                2 mul dup 1 add                 % ... 2k 2k+1
                pdirhold exch get 256 mul       % ... 2k high*256
                exch pdirhold exch get add      % ... high*256+low
                1024 mul                        % ... segment address
                dup 0 eq { exit } if            % zero ends this page
            } for
        }
        { could_not_read_directory_page }
        ifelse

        dup 0 eq { pop exit } if                % drop the zero and stop
    } for
} def


% /readallwords extracts the word information one 1024 byte segment at
% a time and places it in your target file. This method avoids problems
% at 1024 byte boundaries...

% readallwords:  -  readallwords  -
% For every file position held in the wordsegs array, reads 1024 bytes
% from source at that position and writes them to sink. The work is
% bracketed by save/restore and by mark/cleartomark. A short read executes
% the undefined name could_not_read_1024_bytes as an error trap.

/readallwords {
    /rawsnap save store                     % snapshot, restored at the end
    mark

    wordsegs {                              % stack: mark position
        source exch setfileposition         % go to that page
        source 1024 string readstring       % -> substring bool
            { sink exch writestring }       % copy the page to the output
            { could_not_read_1024_bytes }   % error trap
        ifelse
    } forall

    cleartomark
    rawsnap restore
} def


% /makeworddir maps the data pages for the diw word list

% makeworddir:  -  makeworddir  -
% Searches dirindex for the string (diw). From the text that follows the
% match, the 58 bytes starting at offset 9 are stored as /wdir0. Entries
% of wdir0 are two-byte little-endian values; each one times 1024 is used
% as a file position. Entry 0 must be nonzero (else the undefined name
% no_word_page_map_start is executed). Entries 1 through 14 are collected
% until a zero is met. If the last value collected is nonzero, getmore is
% called to add further positions; otherwise the zero is dropped. All
% collected positions end up in the array /wordsegs.
% If (diw) is not found, unable_to_find_diw_directory is executed as an
% error trap.

/makeworddir {
    dirindex (diw) search
    {                                       % stack: post match pre
        pop pop                             % keep only the text after (diw)
        9 58 getinterval                    % extract page bytes
        /wdir0 exch store

        mark                                % start of the position pile
        wdir0 1 get 256 mul                 % high byte of base entry
        wdir0 0 get add                     % plus low byte
        1024 mul                            % base address
        dup 0 eq { no_word_page_map_start } if

        1 1 14 {                            % stack: ... n
            2 mul dup 1 add                 % ... 2n 2n+1
            wdir0 exch get 256 mul          % ... 2n high*256
            exch wdir0 exch get add         % ... high*256+low
            1024 mul                        % ... file position
            dup 0 eq { exit } if            % zero means no more entries
        } for

        dup 0 gt { getmore } { pop } ifelse % more pages, or drop the zero

        ] /wordsegs exch store              % save the position array
    }
    { unable_to_find_diw_directory }        % error trap
    ifelse
} def


% /grabcatwords is the high level routine that does the entire job...

% grabcatwords:  -  grabcatwords  -
% Top level driver. Inside a save/restore pair it runs grabindex,
% makeworddir and readallwords in that order, then closes both the
% source and the sink file.

/grabcatwords {
    /cwsnap save store          % snapshot the machine state
    grabindex                   % read and keep the catalog index
    makeworddir                 % build the word page directory
    readallwords                % copy the word pages to the output
    source closefile            % clean up
    sink closefile
    cwsnap restore              % back to the snapshot
} def

% This is the main catalog grabbing service routine. Make sure you
% have entered your filenames correctly above! Or repeat them here.

grabcatwords                                % run it now: read index, map word pages, copy words, close files


%%%%%%%%%%%% (B) CATALOG NAME LISTING FORMAT IMPROVER  %%%%%%%%%%%%%%

% The "raw" word list extracted from Acrobat Catalog consists of 
% ASCII words in alphabetical order separated by nulls. This routine
% makes the output listing prettier and more word processor friendly.

% It operates *only* on your output file and requires no Catalog input.
% But it *does* need the full catalog file filename from above.

% Allow ten or more seconds for reformatting. This can be dramatically
% sped up by going to more obtuse and non-tutorial code.

(beginning reformatting) print flush    % progress note, flushed right away

% ALWAYS USE "\\" WHEN YOU MEAN "\" IN THE FILENAME STRINGS!!!!!

% Place the exact raw word list filename you wish to expand in the 
% /grabfilename1 string. It is located wherever you just put it.


% Reformatter source: full path of the raw word list file to read.
/grabfilename1 (c:\\Windows\\Desktop\\Nucat\\rawwords) def

% Place the exact full filename of where you wish to put the reformatted
% word list in the /dumpfilename1 string.

% WARNING: If a file with this name already exists, it **WILL** be deleted!

% Reformatter destination: full path of the file that receives the
% formatted word list.
/dumpfilename1 (c:\\Windows\\Desktop\\Nucat\\fmtwords) def

% ERASE any existing write file, to allow creation of a new one rather
% than appending.
%
% `status` applied to a filename string returns
%      pages bytes referenced created true     when the file exists
%      false                                   when it does not.
% Two things matter here:
%   - the operand must be the VALUE of dumpfilename1 (the real path), not
%     the literal string (dumpfilename1);
%   - exactly FOUR values remain after `if` has consumed the boolean, so
%     exactly four pops are needed. A fifth pop would underflow the stack
%     or remove an unrelated operand.

dumpfilename1 status
  { pop pop pop pop                  % discard the four status values
    dumpfilename1 deletefile }       % remove the stale output file
if


/source grabfilename1 (r) file store       % source file object, opened for reading
/sink   dumpfilename1 (w) file store       % sink file object, opened for writing


% Usual choices for word separators are ( ) space or (\n) newline...

/replacechar (\n) def              % word separator: string written to the output in place of each null byte

 % Write the information header to the sink file, one string after the
 % other, in the order listed. The value of grabfilename is written as
 % part of the header.

 sink (\n) writestring
 sink (\nDon Lancaster's key word extractor for Acrobat Catalog v1.3) writestring
 sink (\n===========================================================\n) writestring
 sink (\nVersion 1.3 April 1997  copyright 1997 by Don Lancaster & Synergetics) writestring
 sink (\n3860 West First Street Box 809, Thatcher, AZ, 85552 (520) 428-4073) writestring
 sink (\nAdditional support on http://www.tinaja.com) writestring
 sink (\nConsulting services available at (520) 428-4073 or don@tinaja.com) writestring
 sink (\n) writestring
 sink (\nReformatted indexed word list from\n ) writestring
 sink grabfilename writestring
 sink (\n\n) writestring

        % Discard the first 32 bytes of the raw file (32 leading nulls).
        % A short read executes the undefined name Can_not_read_file,
        % which stops the job with an error.

        source 32 string readstring
            { pop }
            { Can_not_read_file }
        ifelse

        % Main service loop: copy the rest of the file one byte at a time,
        % writing replacechar in place of every null byte.
        % `read` returns  byte true  or, at end of file, just  false,
        % so nothing is left behind on the operand stack when the loop
        % exits (readstring would leave an empty substring there).

        {
            source read
            {                                   % stack: byte
                dup 0 eq
                    { pop sink replacechar writestring }   % null -> separator
                    { sink exch write }                    % any other byte as is
                ifelse
            }
            { exit }                            % end of file reached
            ifelse
        } loop

        source closefile                        % clean up
        sink closefile


%%%%%%%%%%%%%%%%%%%%%%%% end utilities %%%%%%%%%%%%%%%%%%%%%%%%%%%


%   ====================================
%   Copyright c. 1997 by Don Lancaster and Synergetics, Box 809,
%   Thatcher AZ, 85552 (520) 428-4073. synergetics@tinaja.com
%   All commercial rights and all electronic media rights *fully*
%   reserved. Linking welcome. Reposting is expressly forbidden.

%   Further support on http://www.tinaja.com
%   Consulting services available via don@tinaja.com

%   ====================================