#!/bin/sh
# HomepageSearchEngine cronjob script for Unix (last updated on 2006-11-17)
# All lines beginning with "#" are comments.
# This shell script can be used to:
# (1) optionally convert all supported PDF files under your home directory into plain text format
# (2) optionally spider one or more entire sites and create the URL-list required to grab URLs (only required if you will perform step 3)
# (3) optionally grab the content of remote URLs to your site
# (4) make the file-list required for the flat or the indexed search method
# (5) index your site
# (6) optionally rename the URLs in the indexes back to their original ones (only required if you have performed step 3)
# (7) optionally do some additional tasks
# Details can be found in chapter 7.1 ("Spidering and URL Grabbing: Searching of any sites") of the Manual (ReadMe.txt)
#
# This script is an example for having 3 categories:
# As category 1, we want to search the English site "www.site1.tld". This could be our own website, probably containing dynamic content.
# As category 2, we want to search the German site "www.site2.tld". This could be another company site, hosted elsewhere.
# As third category, we want to search both the above sites at once.
# Since these sites are encoded in the same Character Encoding (iso-8859-1), we can use the same configuration set for both.
#
# Make sure your "hse.ini" file contains something like this:
#
# basepath = /home/myusername/htdocs
#
# categories_nr = 3
#
# categories_name1 = www.site1.tld in English
# categories_name2 = www.site2.tld in German
# categories_name3 = all the above sites
#
# categories_dir1 = hse/_sites/en/www.site1.tld
# categories_dir2 = hse/_sites/de/www.site2.tld
# categories_dir3 =
#
# categories_source1 =
# categories_source2 =
# categories_source3 =
#
# Note: a failing step (1)-(7) does not abort the script; only a failure to
# enter the hse directory does, because the later "rm"/"cat"/"ln" commands use
# relative paths and must never run in the wrong directory.

echo
echo "$(date +'%Y-%m-%d %H:%M:%S') Starting building search index"
echo

# Remember the directory we were started from so that we can return to it at
# the end. This is kept separate from DIR (the PDF directory set in step 1).
ORIG_DIR=$(pwd)

# Edit the following path to point to your hse directory!
HSE_DIR="/home/myusername/cgi-bin/hse"
cd "$HSE_DIR" || {
  echo "Cannot change to hse directory '$HSE_DIR' - aborting" >&2
  exit 1
}

# (1) determining unsupported PDF files and converting supported PDF files into plain text format
echo
echo "Now performing step 1: converting PDFs"
echo
# Ensure DIR is set to your home directory (or the directory PDF files you want to make searchable are residing under)
DIR="/path_to/myhomedir"
# The converter's stderr output is collected in unsupported_pdfs.txt below DIR.
./HomepageSearchEngine.cgi pdfconvert "-dir=$DIR" 2> "$DIR/unsupported_pdfs.txt"

# (2) automatically generate the URL-list files from all sites
echo
echo "Now performing step 2: spidering"
echo
# Ensure each SITE value is set properly. Then, only the "-cat" and "-lang" options have to be checked:
SITE="www.site1.tld"
./HomepageSearchEngine.cgi spider -cat=1 -lang=en -pdf2txt "-url=http://$SITE/" -nobackup -batchmode
SITE="www.site2.tld"
./HomepageSearchEngine.cgi spider -cat=2 -lang=de -pdf2txt "-url=http://$SITE/" -nobackup -batchmode

# (3) grab the content of the URLs listed in the URL-list files
echo
echo "Now performing step 3: grabbing the URLs' contents"
echo
./HomepageSearchEngine.cgi geturls -cat=1 -lang=en -nobackup -batchmode
./HomepageSearchEngine.cgi geturls -cat=2 -lang=de -nobackup -batchmode

# (4) make the file-list file pairs
echo
echo "Now performing step 4: making the file-list"
echo
./HomepageSearchEngine.cgi makelist -cat=1 -nobackup -batchmode
./HomepageSearchEngine.cgi makelist -cat=2 -nobackup -batchmode

# (5) create the index file pairs
echo
echo "Now performing step 5: indexing"
echo
./HomepageSearchEngine.cgi index -cat=1 -nobackup -nocheck -batchmode
./HomepageSearchEngine.cgi index -cat=2 -nobackup -nocheck -batchmode

# (6) change the URLs in the indexes back to their original ones
echo
echo "Now performing step 6: changing the URLs' names"
echo
./HomepageSearchEngine.cgi changeurls -cat=1 -nobackup -batchmode
./HomepageSearchEngine.cgi changeurls -cat=2 -nobackup -batchmode

# (7) do some additional tasks
echo
echo "Now performing step 7: doing some additional tasks"
echo
# Provide the index for category 3 by merging the indexes from category 1 and 2.
# First, clean up category 3's index. This must happen before the "cat"
# commands: the "?" wildcard would otherwise also match the old
# hse_index3_*.txt files and merge the stale category 3 index into the new one.
rm -f hse_index3_*.txt
cat hse_index?_html.txt > hse_index3_html.txt
cat hse_index?_nonhtml.txt > hse_index3_nonhtml.txt
# Provide category 3's index also as main index (used if HomepageSearchEngine is called without a "cat" delivery parameter) -
# So make a symbolic link of it (which does not waste disc space):
rm -f hse_index_*.txt
ln -sf hse_index3_html.txt hse_index_html.txt
ln -sf hse_index3_nonhtml.txt hse_index_nonhtml.txt

# Finally, we change the current directory back to the original one:
cd "$ORIG_DIR" || echo "Warning: cannot change back to '$ORIG_DIR'" >&2

echo
echo "$(date +'%Y-%m-%d %H:%M:%S') Finished building search index"
echo