# HomepageSearchEngine cronjob script for Unix (last updated on 2008-10-16 by roal)
# All lines beginning with "#" are comments.
# This shell script is used to:
# (1) optionally spider one or more entire sites and create the URL-list required to grab URLs (only required if you will perform step 2)
# (2) optionally grab the content of remote URLs to your site
# (3) make the file-list required for the flat or the indexed search method
# (4) index your site
# (5) optionally rename the URLs in the indexes back to their original ones (only required if you have performed step 2)
# (6) optionally do some additional tasks
# Details can be found in chapter 7.1 ("Spidering and URL Grabbing: Searching of any sites") of the Manual (ReadMe.txt)
#
# We are using the main configuration set for the English part and the configuration set 1 for the German part of our site.
# Both "hse.ini" files contain:
#
# basepath = ../../htdocs/www.homepagesearchengine.com
#
# categories_nr = 4
#
# The main "hse.ini" file contains:
#
# categories_name1 = HomepageSearchEngine.com and anet.at in English
# categories_name2 = PHP 5 Manual
# categories_name3 = Apache 2 Manual
# categories_name4 = entire web using Google
#
# categories_name5 = www.homepagesearchengine.com in English (for internal use only)
# categories_name6 = www.anet.at in English (for internal use only)
#
# categories_dir1 =
# categories_dir2 = _/php/en
# categories_dir3 = _/apache
# categories_dir4 =
# categories_dir5 = hse/_sites/en/www.homepagesearchengine.com
# categories_dir6 = hse/_sites/en/www.anet.at
#
# categories_source1 = *.txt *.ini *.rtf /hse/* -*.*html -*_de.*
# categories_source2 =
# categories_source3 =
# categories_source4 = query:'Google'
# categories_source5 =
# categories_source6 =
#
# While the "hse.ini" file within the conf/1 directory contains:
#
# categories_name1 = HomepageSearchEngine.com und anet.at in Deutsch
# categories_name2 = PHP 5 Handbuch
# categories_name3 = Apache 2 Manual (Englisch)
# categories_name4 = gesamtem deutschsprachigen Web mit Google
#
# categories_name5 = www.homepagesearchengine.com in German (for internal use only)
# categories_name6 = www.anet.at in German (for internal use only)
#
# categories_dir1 =
# categories_dir2 = _/php/de
# categories_dir3 = _/apache
# categories_dir4 =
# categories_dir5 = hse/_sites/de/www.homepagesearchengine.com
# categories_dir6 = hse/_sites/de/www.anet.at
#
# categories_source1 = *.txt *.ini *.rtf /hse/* -*.*html -*_en.*
# categories_source2 =
# categories_source3 =
# categories_source4 = query:'Google.de'
# categories_source5 =
# categories_source6 =

# Remember the directory the script was started from; DIR is used at the very
# end of the script to change back to it.
DIR=$(pwd)

# Edit the following path to point to your hse directory!
# Every later command uses paths relative to this directory (and step 6 removes
# files with "rm -f"), so stop immediately if it cannot be entered.
cd ~/web/cgi-bin/hse || {
  echo "Error: cannot change to the hse directory (~/web/cgi-bin/hse)" >&2
  exit 1
}

# (1) automatically generate the URL-list files from all sites
echo
echo "Now performing step 1: spidering"
echo

# We want to create indexes of the spidered sites as category 5 and 6, to be used internally only (only the first 4 categories are visible).
# Ensure each SITE value is set properly.
# Then, the "-url" option does not need to be checked:
# SITE is the host name of the site being spidered; it is reused for both the
# start URL and the name of the pre-robots file of each language.
SITE="www.homepagesearchengine.com"
./HomepageSearchEngine.cgi spider -cat=5 -lang=en -url="http://$SITE/" -prerobotsfile="${SITE}_en.txt" -nobackup -debug=robotrules -batchmode
./HomepageSearchEngine.cgi spider -conf=1 -cat=5 -lang=de -url="http://$SITE/" -prerobotsfile="${SITE}_de.txt" -utf8 -nobackup -debug=robotrules -batchmode
SITE="www.anet.at"
./HomepageSearchEngine.cgi spider -cat=6 -lang=en -url="http://$SITE/" -prerobotsfile="${SITE}_en.txt" -nobackup -debug=robotrules -batchmode
./HomepageSearchEngine.cgi spider -conf=1 -cat=6 -lang=de -url="http://$SITE/" -prerobotsfile="${SITE}_de.txt" -utf8 -nobackup -debug=robotrules -batchmode

# (2) grab the content of the URLs listed in the URL-list files
echo
echo "Now performing step 2: grabbing the URLs' contents"
echo
./HomepageSearchEngine.cgi geturls -cat=5 -lang=en -nobackup -debug=options -batchmode
./HomepageSearchEngine.cgi geturls -conf=1 -cat=5 -lang=de -nobackup -debug=options -batchmode
./HomepageSearchEngine.cgi geturls -cat=6 -lang=en -nobackup -debug=options -batchmode
./HomepageSearchEngine.cgi geturls -conf=1 -cat=6 -lang=de -nobackup -debug=options -batchmode

# (3) make the file-list file pairs
echo
echo "Now performing step 3: making the file-list"
echo
# Since all category 5 and 6's files have been spidered, they are HTML files, so we apply the '-nononhtml' option here:
./HomepageSearchEngine.cgi makelist -cat=5 -nononhtml -nobackup -debug=options -batchmode
./HomepageSearchEngine.cgi makelist -conf=1 -cat=5 -nononhtml -nobackup -debug=options -batchmode
./HomepageSearchEngine.cgi makelist -cat=6 -nononhtml -nobackup -debug=options -batchmode
./HomepageSearchEngine.cgi makelist -conf=1 -cat=6 -nononhtml -nobackup -debug=options -batchmode
# Category 1 temporarily contains only the Non-HTML files, collected locally (because they cannot be spidered).
# So we apply the '-nohtml' option here:
./HomepageSearchEngine.cgi makelist -cat=1 -nohtml -nobackup -debug=options -batchmode
./HomepageSearchEngine.cgi makelist -conf=1 -cat=1 -nohtml -nobackup -debug=options -batchmode
./HomepageSearchEngine.cgi makelist -cat=2 -nobackup -debug=options -batchmode
./HomepageSearchEngine.cgi makelist -conf=1 -cat=2 -nobackup -debug=options -batchmode
# Since category 3 of both configuration sets represents the same files, it only needs to be indexed once:
./HomepageSearchEngine.cgi makelist -cat=3 -nobackup -debug=options -batchmode

# (4) create the index file pairs
echo
echo "Now performing step 4: indexing"
echo
# The same rules applied to the "makelist" command above also apply for the "index" command here:
./HomepageSearchEngine.cgi index -cat=5 -nononhtml -nobackup -debug=options -batchmode
./HomepageSearchEngine.cgi index -conf=1 -cat=5 -nononhtml -nobackup -debug=options -batchmode
./HomepageSearchEngine.cgi index -cat=6 -nononhtml -nobackup -debug=options -batchmode
./HomepageSearchEngine.cgi index -conf=1 -cat=6 -nononhtml -nobackup -debug=options -batchmode
./HomepageSearchEngine.cgi index -cat=1 -nohtml -nobackup -debug=options -batchmode
./HomepageSearchEngine.cgi index -conf=1 -cat=1 -nohtml -nobackup -debug=options -batchmode
./HomepageSearchEngine.cgi index -cat=2 -nobackup -debug=options -batchmode
./HomepageSearchEngine.cgi index -conf=1 -cat=2 -nobackup -debug=options -batchmode
./HomepageSearchEngine.cgi index -cat=3 -nobackup -debug=options -batchmode

# (5) change the URLs in the indexes back to their original ones
echo
echo "Now performing step 5: changing the URLs' names"
echo
./HomepageSearchEngine.cgi changeurls -cat=5 -nobackup -debug=options -batchmode
./HomepageSearchEngine.cgi changeurls -conf=1 -cat=5 -nobackup -debug=options -batchmode
./HomepageSearchEngine.cgi changeurls -cat=6 -nobackup -debug=options -batchmode
./HomepageSearchEngine.cgi changeurls -conf=1 -cat=6 -nobackup -debug=options -batchmode
# (6) do some additional tasks
echo
echo "Now performing step 6: doing some additional tasks"
echo

# Add spidered pages to category 1's index:
cat hse_index5_html.txt hse_index6_html.txt > hse_index1_html.txt
cat conf/1/hse_index5_html.txt conf/1/hse_index6_html.txt > conf/1/hse_index1_html.txt

# Provide that index also as main index (used if HomepageSearchEngine is called without a "cat" delivery parameter) -
# So make a symbolic link of it (which does not waste disc space):
rm -f hse_index_*.txt
ln -sf hse_index1_html.txt hse_index_html.txt
ln -sf hse_index1_nonhtml.txt hse_index_nonhtml.txt

# Repeat the linking for configuration set 1. This runs in a subshell so that
# the current directory of the script itself is left untouched, and so that a
# failed "cd" skips the "rm -f"/"ln -sf" calls instead of letting them remove
# and relink the main configuration set's index files.
(
  cd conf/1 || {
    echo "Error: cannot change to conf/1; skipping the links for configuration set 1" >&2
    exit 1
  }
  rm -f hse_index_*.txt
  ln -sf hse_index1_html.txt hse_index_html.txt
  ln -sf hse_index1_nonhtml.txt hse_index_nonhtml.txt

  # Make category 3's index also accessible by configuration set 1:
  rm -f hse_index3_*.txt
  ln -sf ../../hse_index3_html.txt hse_index3_html.txt
  ln -sf ../../hse_index3_nonhtml.txt hse_index3_nonhtml.txt
)

# Finally, we change the current directory back to the original one
# (staying where we are if DIR was never set):
cd "${DIR:-.}" || echo "Warning: cannot change back to '$DIR'" >&2

echo
echo "Now finished"
echo