/* This will take an html document and find all <a and <img links.
  It will not find "FORM" links.
*/


/********** BEGIN USER changeable parameters ***********/

/* Default "base url" -- only used if a file:// url is entered.
   For example, to start at d:\www\guide.htm, you would enter
      file://d:\www\guide.htm
   You are then prompted for the site address (used to resolve the
   relative URLs contained in this file); this value is the default
   offered at that prompt.
*/
defbaseurl='http://www/'

/* Default root directory -- only used if a file:// url is entered.
   It is the default offered at the "root directory" prompt.
*/
defrootdir='/'

/* Default file name (for urls that end with / ).
  For example,
    given a link of <a href="/sports/scoreboard/">
        and def_tofile='index.htm'
    then the contents of this url would be written to:
        destination_dir\sports\scoreboard\index.htm   */
def_tofile='INDEX.HTM'


/* If HTML Document mode is selected, then only links ending with
   one of these extensions are downloaded, examined, and written.
  Notes:
   * In all cases, if the content-type header is NOT text/html,
     the contents will NOT be examined.
   * If "retrieve all links" mode is specified, HTMLEXTS is ignored.
   * If HTMLEXTS='', then this test is not performed.  */
htmlexts='SHTML SHT HTM HTML HTML-SSI HTM-SSI'

/*  Log file. If none is desired, set it to 0. Otherwise, enter a
    filename. Note that an old log file of the same name is deleted
    when the program starts. */
logfile='GRABSITE.LOG'

/* nocgi=1 to skip CGI urls (urls that contain one of the
   NOCGI_STRINGS, such as /CGI, in their path) */
nocgi=1

/* Space delimited list of strings that signifies "this is a
   CGI, or other protocol, script". If left blank, /CGI is used.
   Notes:
     * if ANY of these strings occur anywhere in the url, the url
        will NOT be retrieved
     * a case insensitive comparison is used    */
nocgi_strings='/CGI'

/* nosearch=1 to skip urls that contain a ?xxx search string
   (where xxx is a string of any length) */
nosearch=1              


/* If HTML Document mode is selected, then links ending with one of
   these extensions are NOT downloaded.
  Notes:
   * If "retrieve all links" mode is specified, NOT_HTMLEXTS is ignored.
   * If NOT_HTMLEXTS='', then this test is not performed.
   * The HTMLEXTS test (when HTMLEXTS<>'') is applied first, so this
     list mostly matters when HTMLEXTS=''.   */
not_htmlexts='JPG GIF BMP ZIP GZ TIF TIFF MOV AU EXE COM WAV XBM PDF PS EPS  '


/* overwrite=1 means "overwrite preexisting files".
   overwrite=2 means "keep a preexisting file and use that old copy".
   Any other value means "do not overwrite; skip the url".
*/
overwrite=1


/* Optional request header(s) to send to servers.
   Note: use '0d0a'x to separate multiple request headers  */
reqheaders='User-agent: GrabSite'


/* If a URL's path starts with remove_prefix, then trim
   the beginning of the path (remove everything up to the first /).
   For example,
     if remove_prefix='!RANGE'
     and a link is /!RANGE:bytes=100-200/surplus/prices.lst
     then /surplus/prices.lst is used   */
remove_prefix='!RANGE'


/* If robot_check=1, then check for a /ROBOTS.TXT file. This contains
   instructions on what paths should not be visited by "web robots".
*/
robot_check=1

/* If a URL's path starts with skip_prefix, then skip it.
   This is only needed when the "retrieve" test is /.   */
skip_prefix='!'


/* Status reports:
    -2 for NO status output, -1 for minimal, 0 for average,
    1 for some, 2 for too much */
verbose=1


/********** END USER changeable parameters ***********/

/* ---- command line:  URL DestDir Test1 Test2 Get_all ---- */
parse arg afile destdir includer includer2 write_all  

/* "?" as the only argument: print the calling syntax and stop */
if afile='?' then do
  say "GrabSite -- GET a linked set of pages from the WWW"
  say 
  say "Calling syntax: GrabSite URL DestDir Test1 Test2 Get_all "
  say "   where:"
  say " URL = a fully qualified URL (the home page to start at)"
  say " DestDir = destination directory (on local disk) to write results to"
  say " Test1 = only parse documents in/under this prefix "
  say " Test2 = only retrieve documents in/under this prefix "
  say " Get_all = if 0, then do NOT get non-html documents "
  say " "
  say " Note: to avoid command line problems: use \ instead of /"
  say " "
  say "Example: "
  say "   D:>grabsite http:\\fu.br.net\circ\index.htm d:\foob \circ\ \ 1 "
  say
  say "Or .. enter without arguments for user prompts"
  exit
end /* do */

/* cmdline=1 when a start URL was given on the command line.  The URL
   may have been typed with \ in place of /, so turn those back into / */
cmdline=(afile<>'')
if cmdline then afile=translate(afile,'/','\')

/* keep the Get_all argument exactly as it was supplied */
write_all0=write_all


/* ---- initialize some stuff ---- */
crlf='0d0a'x
baseurl=''
rootdir=''

/* counters shown in the final status report */
ngets=0
n400s=0
noconnects=0
nparsed=0
nwritten=0
ndeleted=0

/* the two scope tests: \ becomes /, then everything is upper-cased */
includer=translate(includer,'/','\')
includer=translate(includer)
includer2=translate(includer2,'/','\')
includer2=translate(includer2)

/* comparisons against these settings are done in upper case */
htmlexts=translate(htmlexts)
not_htmlexts=translate(not_htmlexts)
remove_prefix=translate(remove_prefix)
skip_prefix=translate(skip_prefix)
if nocgi_strings='' then nocgi_strings='/CGI'

/* fileurls. = ordered list of URLs to visit, flist. = times each was seen */
fileurls.0=0
flist.0=0

call loaddll            /* load some dlls, set some parameters */



say
say "         "cy_ye"GrabSite -- GET a set of linked documents from a WWW site"normal
say

/* Prepare the log file.  logfile=0 (or blank) means "no log".
   Otherwise an old log of the same name is removed and a header line
   is written -- the header is written whether or not an old log
   existed, so every log starts the same way. */
if logfile=0 | logfile=' ' then do
    logfile=0
end /* do */
else do
  aa=stream(logfile,'c','query exists')
  if aa<>'' then do
     foo=sysfiledelete(logfile)
     if foo=0 then do
        if verbose>0 then say "Old logfile deleted: "logfile
     end
     else do
        /* could not delete: new entries will be added to the old log */
        say "Warning: could not delete old logfile: "logfile
     end
  end /* do */
  call lineout logfile,'GrabSite log file. Created '||time('n')||' '||date('n')
end

/***** determine file/url to read, and other info */
/* Sets:
     afile       = the local file name, or the fully qualified start URL
     afile_isurl = 0 when starting from a local file (file://...), else 1
     stuff       = contents of the local file (local file case only)
     baseurl     = http://server          (no trailing /)
     rootdir     = /dir/.../              (leading and trailing /)
*/
say 
jump1: nop
if afile='' then do
   afile=getstring("Home page to grab, or enter ? for a brief description.",'?',reverse'  1)'normal)
   if afile='?' then do
        call helpme1
        afile=''
        signal jump1
   end /* do */
end
afile=strip(afile)
afileu=translate(strip(afile))
if abbrev(afileu,'FILE://') then do
   parse var afile . '://' afile
   afileu=translate(afile)
   afile_isurl=0
   if stream(afile,'c','query exists')="" then do
       call printsay "No such file: "afile
       exit
   end /* do */
   call printsay "        ... reading "||cutstrg(afile,50) "...."
   stuff=charin(afile,1,chars(afile))
   /* release the file: it is not read again, and a copy of it may be
      written to the destination directory later on */
   foo=stream(afile,'c','close')
   if baseurl='' then baseurl=getstring("Default site (the dotted ip address)",defbaseurl,reverse' 1a)'normal)
   if rootdir='' then rootdir=getstring("Default 'root' directory ",defrootdir,reverse' 1b)'normal)
end             /* local file as base */
else do         /* it's a url */
   if abbrev(afileu,'HTTP://')<>1 then do
        afile='http://'afile
        afileu=translate(afile)
   end /* do */
   afile_isurl=1
   /* bb1 = server, bb2 = path; rootdir is the path up to its last / */
   parse var afile . '://' bb1 '/' bb2
   baseurl=bb1
   ii=lastpos('/',bb2)
   if ii=0 then 
      rootdir='/'
   else
      rootdir=left(bb2,ii)
end                   /* url entry */

/* normalize: baseurl gets a scheme and no trailing /, rootdir is /.../ */
if pos('://',baseurl)=0 then baseurl='http://'||baseurl
rootdir=strip(rootdir)
if rootdir<>'/' then rootdir='/'||strip(rootdir,,'/')||'/'
baseurl=strip(strip(baseurl,'t','/'))


/* destination directory */
/* Ask for the destination directory when it was not given on the
   command line ("?" shows the help text and asks again).  The name is
   normalized (blanks stripped, exactly one trailing \) BEFORE the
   directory is checked/created, so that the directory that gets
   created is always the one the files are later written to. */
atdestdir: nop
if destdir='' then do
  destdir=getstring("Enter a destination directory ",directory(),reverse'  2)'normal)
  if destdir="?" then do
     call helpme1
     destdir=''
     signal atdestdir
  end /* do */
end
destdir=strip(strip(destdir),'t','\')'\'

/* sysmkdir2 returns 0 when the directory exists or was created */
didit=sysmkdir2(destdir,1)
if didit<>0 then do
   say "Could not access, or create, "destdir
   exit
end /* do */

/* get and set includers variables */
call get_includers

/* Quick/skeleton mode */
/* write_all=1 : retrieve (and save) every link that passes the tests
   write_all=0 : HTML documents only
   Interactive runs ask for the mode and offer to edit the settings. */
getquick:nop
if cmdline<>1 then do
 do until write_all<>''
   al=getstring(' HTML documents only (Yes, No, or ? for help)','N',reverse'  4)'normal)
   al=strip(translate(al))
   if al='?' then do
     al=''
     call help_writeall
     iterate
   end
   if abbrev(al,'N')=1 then
      write_all=1
   else
     write_all=0
 end
 if write_all=0 then
  call printsay "Ignoring non-html documents"
 else
  call printsay "Retrieving all links "
 say

/* modify other parameters */
 if write_all0='' then do
   if yesno(" Would you like to modify configuration parameters?")=1 then do
      call modify_config
   end /* do */
 end

end
else do
 /* Command line run: the Get_all argument may be missing or may be
    something other than 0/1.  Everything below only treats the value
    1 as "retrieve all", so make that explicit; this keeps the
    announced mode and the actual filtering in agreement. */
 write_all=(write_all=1)
end


/************** Done with user input **********/



/*******  copy file/url to destdir */
/* if local file, copy directly to destdir and parse it right away;
   if url, just put it on the list of URLs to visit (it is retrieved,
   and written relative to destdir, by the main loop)
*/

if afile_isurl=0 then do    /* local file  -- jump start*/
  /* target name = destdir + the file-name part of afile */
  ff=translate(afile,' ','\/')
  ff2=word(ff,words(ff))
  ff2=destdir||ff2
  say bold"Saving to "normal|| ff2

  /* writing from position 1 does not shorten an existing file, so
     remove an old copy first; otherwise a longer old file would keep
     its tail after the new contents */
  if stream(ff2,'c','query exists')<>'' then foo=sysfiledelete(ff2)

  foo=translate(stream(ff2,'c','open write'))
  if foo<>'READY:' then do
     say "Could not open file for writing. Error was: " foo
     exit
  end /* do */
  foo=charout(ff2,stuff,1)
  if foo<>0 then do
      say "Error. Problem writing file "
      exit
  end /* do */
  foo=stream(ff2,'c','close')
  nwritten=nwritten+1          /* this copy counts as a written file */

  goo=time('e')                /* get stuff from file */
  foo=urls_in(stuff,baseurl,rootdir,afile)
  goo2=time('e')
  if verbose>0 then do
    if goo2-goo>5 then call printsay "  ... done parsing "||cutstrg(afile,50)
    call printsay  ' '
    call printsay "    " cy_ye " # links in "normal||bold||afile"="normal||"  "||fileurls.0
  end
  nparsed=1

end
else do                 /* a url */
   /* the start URL becomes entry 1 of the list; flist. marks it as seen */
   iurls=1
   uaref=translate(strip(afile))
   flist.uaref=1
   flist.0=1
   fileurls.iurls=afile
   fileurls.iurls.!ref='user'
   fileurls.0=iurls
end /* do */

/**** get a robot.txt file first? */
/* When robot_check=1: fetch /robots.txt from the site, hand the
   response to add_robot, and store the blank-delimited list it
   returns (upper-cased) in exclist.1 ... exclist.n, with the count
   in exclist.0 */
if robot_check=1 then do
   aurl=baseurl'/robots.txt'
   rlist=get_url(aurl)
   exclist=add_robot(rlist)
   if verbose>0 then do
       call printsay "Excluding: "exclist
       call printsay ' '
   end 

   /* peel one word at a time off a copy of the list */
   rb_rest=exclist
   rb_n=0
   do while rb_rest<>''
      parse var rb_rest rb_word rb_rest
      rb_n=rb_n+1
      exclist.rb_n=translate(rb_word)
   end /* do */
   exclist.0=rb_n
end                     /* build exclist. */


/************ Get urls in first file/url */
/* Announce the starting point.  The wording depends on the mode
   (write_all=0 means html documents only); a long name is put on a
   line of its own. */
call printsay ' '
if write_all=0 then
   startmsg=' Examining html links starting from'
else
   startmsg=' Examining links starting from'

if length(afile)<40 then do
   call printsay startmsg||':'||bold||afile||normal
end
else do
   call printsay startmsg||'...'
   call printsay '   :'||bold||afile||normal
end /* do */
call printsay ' '



/********** now get the urls, parse, add to list.... */
/* fileurls. is used as a work queue: entry mm is examined, and
   parsing a retrieved html page may add new entries at the end (via
   urls_in), so the loop runs until mm passes fileurls.0.
   For every entry:
     1) normalize the path (/../), recompute baseurl and rootdir
     2) apply the skip tests (robots, "retrieve" prefix, CGI, ?, prefix,
        extension lists)
     3) GET it, split off the response line and headers (extracts)
     4) save it, and -- when it is text/html and passes the "examine"
        prefix test -- parse it for more links
*/
mm=0
do forever
   mm=mm+1
   if mm>fileurls.0 then leave

   goob=fileurls.mm
   f1f=goob                                  /* shortened name, for messages */
   if length(f1f)>40 then f1f='...'right(goob,36)
   oof=''                                    /* "found in" note, for messages */
   if verbose>0 then oof=']--'||filespec('n',fileurls.mm.!ref)

   parse var goob . '://' bb1 '/' asel
   aselorig=asel
/* check asel for ../ constructions, and fix if found */
   do forever
     if pos('/../',asel)=0 then leave              /* no ../ to remove */
     parse var asel p1 '/../' p2 
     ip0=lastpos('/',p1)
     if ip0=0  then do
         asel='..'
         leave     /* i.e.; http://foo.bar.net/../ are disallowed */
     end /* do */
     asel=left(p1,ip0)||p2
   end /* do */

   if asel='..' then do         /* .. signals an error */
      if verbose>1 then  
           call  printsay "Skipping #"bold||mm||normal||" (too many /../) "||f1f||oof 
      iterate
   end

   if asel<>aselorig & verbose>1 then call printsay "... using " asel

   goob2=translate('HTTP://'||bb1||'/'||asel)

   /* base url and root directory of THIS document (used to resolve
      the relative links it contains) */
   baseurl=bb1
   ii=lastpos('/',asel)
   if ii=0 then 
      rootdir='/'
   else
      rootdir=left(asel,ii)
   if pos('://',baseurl)=0 then baseurl='http://'||baseurl
   rootdir=strip(rootdir)
   if rootdir<>'/' then rootdir='/'||strip(rootdir,,'/')||'/'
   baseurl=strip(strip(baseurl,'t','/'))

   if robot_no(asel)=1 then do
     if verbose>1 then  call printsay "Skipping #"bold||mm||normal||" (robot exclusion) "||f1f||oof 
     iterate 
   end

   if includer2<>"" then do          /* only GET if in/under this directory */
     if abbrev(goob2,includer2)=0 then do 
        if verbose>1 then  call printsay "Skipping #"bold||mm||normal||" (not in dir) "||f1f||oof        
        iterate
     end
   end

   if nocgi=1 then do           /* cgi? then skip */
     ccok=1
     do kk=1 to words(nocgi_strings)
        if ccok=0 then leave
        acc=strip(translate(word(nocgi_strings,kk)))
        gloop=pos(acc,translate('/'asel))
        if gloop>0 then do
             ccok=0
        end
     end
     if ccok=0 then do
         if verbose>1 then call printsay "Skipping #"bold||mm||normal||" (CGI) "||f1f||oof 
         iterate
     end
   end

   if nosearch=1 then do                /* skip "search string" calls (usually to scripts) */
      if pos('?',asel)>0 then do
        if verbose>1 then call printsay "Skipping #"bold||mm||normal||" (contains ?) "||f1f||oof 
        iterate
      end
   end /* do */

   if skip_prefix<>'' then do   /* ignore if starts with this? */
      if abbrev(asel,skip_prefix)=1 then do
        if verbose>1 then call printsay "Skipping #"bold||mm||normal||" (contains "skip_Prefix") "||f1f||oof 
        iterate    
      end /* do */
   end /* do */

/* Extension tests (html documents only mode).
   The extension is taken from the LAST path segment only, so a dot in
   a directory name is not mistaken for an extension, and any ?query
   or #fragment is ignored.  A URL with no extension (for example one
   that ends with /) cannot be judged by its name: it is retrieved and
   the content-type header decides later.  Whole-word matching is used
   so that, say, "HT" does not match "HTML". */
   lastseg=substr(asel,lastpos('/',asel)+1)
   parse var lastseg lastseg '?' .
   parse var lastseg lastseg '#' .
   ara=lastpos('.',lastseg);anext=''
   if ara>0 then do                   /* check for html type of extension */
       anext=translate(strip(substr(lastseg,ara+1)))
   end
   if write_all<>1 & anext<>'' then do
      if htmlexts<>'' then do            /* only get possible htmls */
         if wordpos(anext,translate(htmlexts))=0 then do
            if verbose>1 then call printsay "Skipping #"bold||mm||normal||" (not an html extension) "||f1f||oof
            iterate
         end
      end
      if not_htmlexts<>"" then do        /* don't get almost certainly NOT htmls */
         if wordpos(anext,translate(not_htmlexts))>0 then do
            if verbose>1 then call printsay "Skipping #"bold||mm||normal||" (non-html extension) "||f1f||oof
            iterate
         end
      end /* do */
   end

   if verbose>-1 then call printsay "Checking "bold||mm||normal||" of "fileurls.0")"||f1f||oof

/* get the url */
   goo=time('e')
   stuff=get_url(goob,,verbose,reqheaders)
   goo2=time('e')
   if goo2-goo>5 & verbose>0 then call printsay "  .... done GETting "||cutstrg(goob,50)

   if stuff="" then do
      if verbose>1 then call printsay "Skipping #"bold||mm||normal||" (unable to connect) "||f1f||oof 
      noconnects=noconnects+1
      iterate
   end /* do */

   ngets=ngets+1
   call extracts                /* extract body and head */
/* look for return code */
   parse var response_line . icode .
   
   r1=left(response_code,1)
   if r1=4 | r1=5 | r1=1  then do              /* error response */
      if verbose>1 then call printsay "Skipping #"bold||mm||normal||" (response code "response_code") "||f1f||oof
      n400s=n400s+1
      iterate
   end /* do */

   
/* get the content-type; parameters such as "; charset=..." are not
   part of the media type, so they are cut off before the comparison */
   ss='!CONTENT-TYPE'
   parse value translate(headers.ss) with ctype ';' .
   if strip(ctype)<>'TEXT/HTML' then do  /* not html -- don't parse */
     if write_all=1 then call url_to_file goob2  /* but possibly save to disk */
     iterate /* don't bother parsing this */
   end

/* does it satisfy the INCLUDER test? */
   if includer<>"" then do          
     if abbrev(goob2,includer)=0 then do
        call url_to_file goob2
        iterate /* don't bother parsing this */
     end /* do */
   end

/* extract links, but first write it to disk */
   call url_to_file goob2
   if result=0 then iterate

/* if here, extract urls and add to list */
   eek=fileurls.0
   goo=time('e')
   if verbose>0 then call printsay "  .... parsing "||cutstrg(goob,50)

   if r1=3 then do      /* redirect -- extract location header */
     ss='!LOCATION'
     asd=''
     /* only use the header when the response really had one: an unset
        compound variable would otherwise yield its own name */
     if symbol('headers.ss')='VAR' then asd=strip(headers.ss)
     if asd<>'' then do
        stuff=stuff||'<a href="'asd'"> '   /* convert location header to link (a small hack) */
     end /* do */
   end /* do */

   foo=urls_in(stuff,baseurl,rootdir,goob)
   goo2=time('e')
   if goo2-goo>5 & verbose>0 then call printsay "   ... done parsing "||cutstrg(goob,50)

   nparsed=nparsed+1
   if verbose>1 & eek<fileurls.0 then
      call printsay "  new links to check: "bold||(fileurls.0-eek)||normal

end  /* ******* Read a url */

/**** Status info */
/* Final summary (screen, and log via printsay), then close the log
   file and stop. */
st_gets="Total number retrieval attempts: "||' '||ngets||' (400s='||n400s||'. No Connect='||noconnects||')'
st_written="Total number of files written: "||' '||nwritten||' (files deleted='||ndeleted||')'

call printsay ' '
call printsay ' ------- Status: '
call printsay "Total number of unique URLs: "||fileurls.0
call printsay st_gets
call printsay "Total number of parsed pages: "||nparsed
call printsay st_written
call printsay " "
call printsay "Reminder: files are written to "||bold||destdir||normal

if logfile=0 then exit

/* a log is in use: name it, and close it (lineout with no data) */
say '                ** The log file is: '||' '||logfile
call lineout logfile
exit


/********/
/* modify configuration parameters */
/* Interactive editor for the user changeable parameters.
   Repeatedly asks for a parameter name:
      ?   lists the parameters, with a short description
      ??  lists their current values
      X   ends the editor
   and otherwise asks for a new value (ENTER keeps the old one).
   The parameters are ordinary global variables; they are read and set
   by name with VALUE().  Returns 0. */
modify_config:

params="def_tofile htmlexts logfile not_Htmlexts overwrite robot_check reqheaders "
params=params||"verbose nocgi nocgi_strings nosearch remove_prefix skip_prefix"

params=translate(params)

/* these are upper-cased once at start-up; a value typed here must get
   the same treatment, or the comparisons against it would fail */
upper_params='HTMLEXTS NOT_HTMLEXTS REMOVE_PREFIX SKIP_PREFIX'
say

do forever
aa=getstring("Select a parameter to modify (?=list,??=current values, X=done)","?",reverse" -->"normal)
if aa="?" then do
   say
   say "       "reverse"Configuration Parameters: "normal
   say bold"  DEF_TOFILE"normal"= default filename, used when a URL does not contain a filename"
   say bold"    HTMLEXTS"normal"= HTML extensions (if quick mode selected, only files with these "
   say "              extensions are retrieved)"
   say bold"     LOGFILE"normal"= Name of logfile (results are recorded here)"
   say bold"       NOCGI"normal"= If 1, do NOT retrieve URLs containing a NOCGI_STRINGS"
   say bold"       NOCGI_STRINGS"normal"= List of strings that signify CGI script, or other types"
   say      "                       of URLs to skip"
   say bold"    NOSEARCH"normal"= If 1, do NOT retrieve URLs that end with a ?xxxx "
   say bold"NOT_HTMLEXTS"normal"= non-HTML extensions (if quick mode selected, files with these "
   say "              extensions are ignored)"
   say bold"   OVERWRITE"normal"= If 1, then overwrite preexisting files "
   say bold"REMOVE_PREFIX"normal"= If the URL's path starts with this, then trim the   "
   say     "              beginning of the path (remove everything up to the first /) "
   say bold"  REQHEADERS"normal"= Optional request header(s) to send to servers "
   say bold" ROBOT_CHECK"normal"= If 1, check for a /ROBOTS.TXT file and skip the paths it excludes "
   say bold" SKIP_PREFIX"normal"= If the URL's path starts with this, then skip it "
   say bold"     VERBOSE"normal"= If 1, verbose mode "
   say

   iterate
end /* do */
if aa="??" then do
   say
   say "       "reverse"Current values of configuration Parameters: "normal
   say bold"  DEF_TOFILE"normal"= "def_tofile
   say bold"    HTMLEXTS"normal"= "htmlexts
   say bold"     LOGFILE"normal"= "logfile
   say bold"       NOCGI"normal"= "nocgi
   say bold"       NOCGI_STRINGS"normal"= "nocgi_STRINGS
   say bold"    NOSEARCH"normal"= "nosearch
   say bold"NOT_HTMLEXTS"normal"= "not_htmlexts
   say bold"   OVERWRITE"normal"= "overwrite
   say bold"REMOVE_PREFIX"normal"= "remove_Prefix
   say bold"   REQHEADERS"normal"= "reqheaders
   say bold"  ROBOT_CHECK"normal"= "robot_check
   say bold" SKIP_PREFIX"normal"= "skip_prefix
   say bold"     VERBOSE"normal"= "verbose
   say
   say "Note: you can permanently change these values by editing GRABSITE.CMD"
   say
   iterate
end /* do */
aa=translate(strip(aa))
if aa='X' then leave
if wordpos(aa,params)=0 then do
   say "No such parameter: " aa
end /* do */
else do
   aaold=value(aa)
   bb=getstring("Enter new value for "aa,aaold,bold"    --->"normal)
   if wordpos(aa,upper_params)>0 then bb=translate(bb)
   foo=value(aa,bb)
end
   

end

return 0



/********/
/* get and set includer and includer2 */
/* includer  = prefix a URL must have to be retrieved AND examined
   includer2 = prefix a URL must have to be retrieved at all
   Either may already hold a path from the command line; a missing one
   is asked for ("?" shows the help and asks again).  On return both
   are upper-cased, fully qualified URL prefixes, and len_includer2
   holds the length of includer2.  Returns 0. */
get_includers:

include1: nop
do forever
   if includer='' then
      includer=getstring(" Only GET & examine & save urls in or under (? for help) ",rootdir,reverse' 3)'normal)
   if includer<>"?" then leave
   call help_includer
   includer=''
end /* do */
includer=translate(includer)

include2: nop
do forever
   if includer2='' then
      includer2=getstring(" Only GET & save urls that being with ",includer,reverse' 3b)'normal)
   if includer2<>"?" then leave
   call help_includer
   includer2=''
end /* do */
includer2=translate(includer2)

/* qualify the "examine" prefix with the site address, and show it */
select
   when includer='' then includer=baseurl||rootdir
   otherwise includer=baseurl||'/'||strip(includer,'l','/')
end
say 
if length(includer)>=50 then do
   call printsay "Only examining URLs in/under... "
   call printsay "   : "||includer
end
else do
   call printsay "Only examining URLs in/under: "||includer
end /* do */

/* same for the "retrieve" prefix */
select
   when includer2='' then includer2=baseurl||'/'
   otherwise includer2=baseurl||'/'||strip(includer2,'l','/')
end
if length(includer2)>=50 then do
   call printsay "Only retrieving URLs in/under ... "
   call printsay "   : "||includer2
end
else do
   call printsay "Only retrieving URLs in/under: "||includer2
end 

/* the tests are done against upper-cased URLs */
includer=translate(includer)
includer2=translate(includer2)
len_includer2=length(includer2)
say 
return 0


/**************************************************/
/* copy a url to a file */
/* Call as:   call url_to_file aurl
   Writes the global STUFF (the body of the response just retrieved)
   to a file under DESTDIR.  The file name is the part of the URL that
   follows the deepest directory named in INCLUDER2 (or the whole path
   when INCLUDER2 is empty); a name that ends with / gets DEF_TOFILE
   appended.  Missing directories are created.
   A preexisting file is handled according to OVERWRITE:
       1 = delete it and write the new contents
       2 = keep it, and report success (the old copy is used)
       otherwise = keep it, and report failure
   Returns 1 if the file was written (or an old copy is used), 0 if not.
   Sets globals: goob2, tofile, tofile2, todir, nwritten, ndeleted */
url_to_file:

parse arg afil

goob2=translate(afil)
if includer2<>"" then do          /*relative to includer2 directory */
     /* cut at the last / of includer2, so the result starts at the
        directory boundary even when includer2 does not end with / */
     cutat=lastpos('/',includer2)
     if cutat<=pos('://',includer2)+2 then cutat=length(includer2)+1  /* no path in includer2 */
     tofile=substr(goob2,cutat)
end
else do
   parse var afil . '://' . '/' tofile
end /* do */

if tofile='' then do
      call printsay "   ERROR: could not write " afil
      call printsay "      ("goob")"
      return 0
end /* do */

if right(tofile,1)='/' then tofile=tofile||def_tofile

/* save  to destdir */
tofile=translate(tofile,'\','/')
tofile=strip(strip(tofile),'l','\')

tofile2=destdir||tofile
todir=filespec('d',tofile2)||filespec('p',tofile2)
mkit=sysmkdir2(todir)
   
yow=stream(tofile2,'c','query exists')
if yow<>'' then do
   if overwrite=2 then do
      if verbose>-1 then call printsay "  "||cy_ye||tofile2||normal " old version used."
      return 1 /* use old copy */
   end
   if overwrite=1 then do
            if verbose>0 then call printsay "  .... deleting "tofile2
            foo=sysfiledelete(tofile2)
            if foo=0 then ndeleted=ndeleted+1
   end /* do */
   else do      
         call printsay "  > "tofile2 " exists; "bold"skipping "normal
         return 0
   end /* do */
end /* do */

foo=translate(stream(tofile2,'c','open write'))
if foo<>'READY:' then do
      call printsay "   ERROR: could not open " tofile2
      call printsay "      ("goob")"
      return 0
end /* do */
wow=charout(tofile2,stuff,1)
if wow<>0 then do                 /* wow = number of characters NOT written */
      foo=stream(tofile2,'c','close')
      call printsay "   ERROR: could not write " tofile2
      call printsay "      ("goob")"
      return 0
end /* do */
foo=stream(tofile2,'c','close')
if foo="READY:" then do
      if verbose>-2 then call printsay "  "||cy_ye||tofile2||normal " written."
end /* do */


nwritten=nwritten+1
return 1                /* sets globals */


/********************/
/* search a file, find IMG SRC= and A HREF= urls. Add BASEURL if
   no / or http://.../ at beginning of URL */
/* Call as:   n=urls_in(stuff,baseurl,rootdir,stuffname)
     stuff     : the html text to search
     baseurl   : http://server -- prefixed to links that start with /
     rootdir   : /dir/ of the document -- used for relative links
     stuffname : name of the document (recorded as the "referer")
   Every link found (by afindsrc) is made fully qualified and counted
   in flist. ; a link seen for the first time is appended to fileurls.
   Returns the index of the last entry of fileurls. (unchanged when no
   new link was found). */

urls_in:procedure expose  fileurls. flist. remove_prefix bold normal logfile reverse cy_ye verbose

parse arg stuff, baseurl,rootdir,stuffname

iurls=fileurls.0                /* the value returned if nothing is added */

/* links that use these schemes can not be retrieved by this program */
notfetched='MAILTO: FTP: GOPHER: HTTPS: NEWS: TELNET: JAVASCRIPT:'

/* remove comments */
body=""
do forever              /*no comments within comments are allowed */
   if stuff="" then leave
   parse var stuff t1 '<!-- ' t2 '-->' stuff
   body=body||t1
end /* do */
stuff=body
body=''

/* detail messages are only shown at the highest verbose level, so the
   normal output is not cluttered */
if verbose>1 then call printsay "Parsing "||length(stuff)||' characters'
/* find all IMG SRC= and A HREF=, FRAME= throw away internal links */
do until stuff=""
    parse var stuff . '<' anarg '>' stuff
    aref=afindsrc(anarg)

    /* a #fragment names a spot inside a document; it is not part of
       the address that is requested from the server */
    parse var aref aref '#' .
    if aref='' then iterate

    uaref=translate(aref)
    skipit=0
    do is=1 to words(notfetched)        /* only keep http links */
       if abbrev(uaref,word(notfetched,is))=1 then skipit=1
    end
    if skipit=1 then iterate

/* fix up name to be fully qualified url */
     select
          when  abbrev(translate(aref),'HTTP://')=1 then nop
          when abbrev(aref,'/')=1  then aref=baseurl||aref
          otherwise aref=baseurl||rootdir||aref
     end

/* check for remove_prefix entries */
     if remove_prefix<>'' then do
              parse var aref a1 '://' a2 '/' aaurl
              if abbrev(translate(aaurl),translate(remove_prefix))=1 then do
                    parse var aaurl  . '/' aaurl
                    aref=a1'://'a2'/'aaurl
                    if verbose>1 then call printsay "   > " remove_prefix "removal yields: "aref
              end /* do */
     end /* do */

/* record this entry only if not yet recorded -- else, just increment counter */
     uaref=translate(aref)
     if datatype(flist.uaref)<>'NUM' then flist.uaref=0
     flist.uaref=1+flist.uaref
     flist.0=flist.0+1
     if flist.uaref=1 then do
        iurls=fileurls.0+1
        fileurls.iurls=aref
        fileurls.iurls.!ref=stuffname
        fileurls.0=iurls
     end
end /* do */

return iurls


/*****************/
/* get a string from user */
/* Call as:   ans=getstring(prompt,def,prompt0)
     prompt  : the question
     def     : default answer, used when the user just hits ENTER
     prompt0 : short tag written in front of the question
   Long prompts/defaults are spread over two or three lines.
   BOLD and NORMAL are the screen attribute strings set by loaddll; if
   either was never set it is treated as empty, in every layout. */
getstring:procedure expose normal bold reverse logfile cy_ye
parse arg prompt,def,prompt0
abold=bold
if bold="BOLD" then abold=''            /* BOLD was never assigned */
anormal=normal
if normal='NORMAL' then anormal=''      /* NORMAL was never assigned */

l1=length(prompt)
l2=length(def)
if l1+l2>38 then do
   say prompt0' 'abold||prompt||anormal
   if l2>22 then do
     say '      (ENTER='abold||def||anormal')'
     call charout, abold"     ? "anormal
     parse pull ans
   end /* do */
   else do
     call charout,'      (ENTER='abold||def||anormal')? '
     parse pull ans
   end
end
else do
  call charout,prompt0' 'abold||prompt||anormal' (ENTER='abold||def||anormal')? '
  parse pull ans
end
if ans='' then ans=def
return ans



/* ---------------------------------------------*/
/* get a url from some site, return (about) the first
maxchar characters (if maxchar missing, get 10million (the whole thing?)
  call as:   stuff=get_url(aurl,maxchar,verbose,headers)
where:
  aurl: the url to GET (required); it must begin with http://
        The server part may include a port (http://server:port/...);
        port 80 is used when none is given.
the other 3 are optional:
  maxchar: max chars to get (default=10,000,000). Reading stops after
           the first block that takes the total over this limit.
  verbose: verbose mode (default=OFF)
  headers: list of extra request headers, CRLF delimited 
returns:
  the complete response (response line, headers and body), or ''
  when the url is invalid or the server could not be reached
*/
/* ---------------------------------------------*/



get_url:procedure expose logfile bold normal reverse cy_ye
parse arg aurl,maxchar,verbose,headers

if maxchar="" then maxchar=10000000

got=""
if abbrev(translate(aurl),'HTTP://')=0 then do
  if verbose>0 then call printsay "Error: URL not properly specified (it must begin with HTTP://)"
  return ''
end

parse var aurl . '://' server '/' request

/* split an optional :port off the server name; the full host:port is
   kept for the Host: request header */
hostport=server
parse var server server ':' httpport
if datatype(httpport,'W')=0 then httpport=80

if VERBOSE>1 then do 
  if length(server||request)<65 then do
     call printsay "  GETting: " server ", " request
  end
  else do
     call printsay "  GETting: " server " " 
     call printsay "           " request
  end /* do */
end /* do */

/* now get the url.  This requires the RxSock.DLL be in your LIBPATH. */

/* Load RxSock (RxFuncQuery returns 1 when the function is NOT registered) */
    if RxFuncQuery("SockLoadFuncs") then do
       call RxFuncAdd "SockLoadFuncs","rxSock","SockLoadFuncs"
       call SockLoadFuncs
    end

    crlf    ='0d0a'x                        /* constants */
    family  ='AF_INET'

    rc=sockgethostbyname(server, "serv.0")  /* get dotaddress of server */
    if rc=0 then do
        call printsay '    Unable to resolve "'server'"'
        return ''
    end
    dotserver=serv.0addr                    /* .. */
    gosaddr.0family=family                  /* set up address */
    gosaddr.0port  =httpport
    gosaddr.0addr  =dotserver

    gosock = SockSocket(family, "SOCK_STREAM", "IPPROTO_TCP")
    if gosock<0 then do
        call printsay '     Unable to create a socket for "'server'"'
        return ''
    end

    /* Set up request  */
    message="GET /"request' HTTP/1.0'crlf||'Host: 'hostport||crlf
    if length(headers)>2 then do
       if right(headers,2)=crlf then headers=left(headers,length(headers)-2)
    end
    if headers<>'' then message=message||headers||crlf
    message=message||crlf

    got=''
    rc = SockConnect(gosock,"gosaddr.0")
    if rc<0 then do
        call printsay '     Unable to connect to "'server'"'
        rc = SockClose(gosock)              /* do not leak the socket */
        return ''
    end
    rc = SockSend(gosock, message)
    if rc<0 then do
        call printsay '     Unable to send the request to "'server'"'
        rc = SockClose(gosock)
        return ''
    end

 /* Now wait for the response: read blocks until the server closes the
    connection (rc=0), an error occurs (rc<0), or maxchar is passed */

   do forever
     rc = SockRecv(gosock, "response", 1000)
     if rc<=0 then leave
     got=got||response
     if length(got)> maxchar then leave
  end

  rc = SockClose(gosock)

return got




/* --- Load the function library, if necessary --- */
/* Registers the RxSock and RexxUtil function packages when they are
   not registered yet, and sets the screen attribute strings
   (cy_ye, normal, bold, re_wh, reverse) -- to ANSI escape sequences
   when checkansi reports ANSI support, to empty strings otherwise.
   Returns 1. */
loaddll:

/* RxFuncQuery returns 1 when the function is NOT yet registered */
if RxFuncQuery("SockLoadFuncs")=1 then do
  call RxFuncAdd "SockLoadFuncs","rxSock","SockLoadFuncs"
  call SockLoadFuncs
end

foo=rxfuncquery('sysloadfuncs')
if foo=1 then do
  call RxFuncAdd 'SysLoadFuncs', 'RexxUtil', 'SysLoadFuncs'
  call SysLoadFuncs
end

/**** (disabled: REXXLIB is not needed)
foo=rxfuncquery('rexxlibregister')
if foo=1 then do
 call rxfuncadd 'rexxlibregister','rexxlib', 'rexxlibregister'
 call rexxlibregister
end
foo=rxfuncquery('rexxlibregister')
if foo=1 then do
    say " Could not find REXXLIB "
    exit
end 
***/

ansion=checkansi()
if ansion<>1 then do
  /* no ANSI: all attribute strings are empty */
  say " Warning: Could not detect ANSI....  output will look ugly ! "
  reverse=""
  re_wh=""
  bold=""
  normal=""
  cy_ye=""
end
else do
  /* ANSI: each attribute is ESC followed by its control sequence */
  aesc='1B'x
  reverse=aesc||'[7;m'
  re_wh=aesc||'[31;47;m'
  bold=aesc||'[1;m'
  normal=aesc||'[0;m'
  cy_ye=aesc||'[37;46;m'
end  /* Do */

return 1

/* -------------------- */
/* get a yes or no , return 1 if yes */
/* Call as:   ans=yesno(question,allopt,altans)
     question : the text of the question
     allopt   : if 1, "All" is offered as a third answer
     altans   : optional; two words to use in place of Yes and No
   Returns 1 for the first answer (Yes), 0 for the second (No), and
   2 for All.  Only the first letter of the reply matters, and the
   comparison is case insensitive; the question is repeated until a
   valid reply is given. */
yesno:procedure expose normal reverse bold logfile cy_ye
parse arg fooa , allopt,altans
if altans<>" " & words(altans)>1 then do
   w1=strip(word(altans,1))
   w2=strip(word(altans,2))
   a1=left(w1,1) ; a2=left(w2,1)
   a1a=substr(w1,2) ; a2a=substr(w2,2)
end
else do
    a1='Y' ; a1a='es'
    a2='N' ; a2a='o'
end  /* Do */
ayn='  '||bold||a1||normal||a1a||'\'||bold||a2||normal||a2a
if allopt=1 then  ayn=ayn||'\'||bold||'A'||normal||'ll'

/* PULL upper-cases the reply, so the letters it is compared with must
   be upper case too (they are shown as they were given) */
key1=translate(a1)
key2=translate(a2)

do forever
 foo1=normal||reverse||fooa||normal||ayn
 call charout,  foo1 normal ':'
 pull anans
 if abbrev(anans,key1)=1 then return 1
 if abbrev(anans,key2)=1 then return 0
 if allopt=1 & abbrev(anans,'A')=1 then return 2
end

/* Ends the program (exit code 0) unless rc is -7, in which case it
   returns 0 to its caller.
   NOTE(review): no reference to this label is visible here; it looks
   like a handler for a failed connection -- confirm before relying on it. */
nocon:
if rc<>-7 then exit 0
return 0


 /* ------------------------------------------------------------------ */
 /* function: Check if ANSI is activated                               */
 /* Runs the ANSI command with its output piped into the REXX queue    */
 /* and looks for the "on" wording in the reply.                       */
 /* returns:  1 = ANSI is on,  0 = ANSI is not reported as on,         */
 /*          -1 = the command raised an error condition                */
 CheckAnsi: PROCEDURE
   trace off
   ansiState = -1        /* the result if the command below fails      */

                         /* install a local error handler              */
   SIGNAL ON ERROR Name InitAnsiEnd

   "@ANSI 2>NUL | rxqueue 2>NUL"

   ansiState = 0         /* the command ran; now inspect its output    */

   do while queued() > 0
     answer = lineIN( "QUEUE:" )
     usaOn = ( pos( " on.", answer ) > 0 )                      /* USA */
     gerOn = ( pos( " (ON).", answer ) > 0 )                    /* GER */
     if usaOn | gerOn then ansiState = 1
   end /* do while queued() > 0 */

 InitAnsiEnd:
 signal off error
 RETURN ansiState

/*************************/
/* return 1 if adir is an existing (possibly empty) directory , 0 if not */
/* A name without a drive letter is completed with the current drive;
   a name that does not start with \ is also taken relative to the
   current directory. */
dosisdir2:procedure 
parse arg adir

adir=strip(strip(adir),'t','\')

if filespec('d',adir)='' then do
   here=directory()'\'
   curdrive=filespec('d',here)
   if abbrev(adir,'\') then
       adir=curdrive||adir                          /* \path : add the drive */
   else
       adir=curdrive||filespec('p',here)||adir      /* relative path */
end /* do */

/* 'D' = look for directories only.  HITS is not assigned, so its
   value is its own name: the matches are reported in the stem HITS. */
call sysfiletree adir,hits,'D'
return (hits.0>0)



/*************************************/
/* parse GETten stuff to globals
  response_line = the response line  
  response_code = the 200, 401, etc. code
  headers. = list of response headers:
             headers.0     = blank delimited list of the header names
                             found (upper case, each prefixed with !)
             headers.!NAME = value of that header, as received
             any header that was NOT in the response is ''
  stuff = the contents (the file)
*/
extracts:
cr='0a'x                        /* lines end with LF; a CR in front of it is stripped */
parse var stuff response_line (cr) stuff
parse var response_line . response_code .
response_line=strip(response_line,,'0d'x)
  /* forget the headers of the previous response, so that a header
     missing from this one is not answered with an old value */
  headers.=''
  headers.0=''
  do forever
    parse var stuff  ahead  (cr) stuff
    ahead=strip(ahead,,'0d'x)
    if ahead='' then leave              /* empty line: end of the headers */
    parse var ahead name ':' aval
    nn=translate('!'||strip(name))
    headers.0=headers.0' 'nn
    headers.nn=aval
  end /* do */

return 1


/* ------------- */
/* create a directory, arbitrarily deep.
Returns 0 if success, otherwise returns an error code 
(-1 when there is no directory to create, else the result of the
last SysMkDir call).
adir: directory to create -- must be fully qualified.
      Directory names may contain blanks.
verbose: if 1, will write some status stuff to screen
*/

/* The expose list matches the other procedures that call printsay.
   NOTE(review): printsay is assumed to need these globals (it is not
   part of this section) -- confirm. */
sysmkdir2:procedure expose logfile bold normal reverse cy_ye
parse arg adir,verbose

adir=strip(adir,'t','\')

if dosisdir2(adir)=1 then do  /* already exists */
   if verbose=1 then say "       Using pre-existing directory: "adir
   return 0
end /* do */

/* the simple case: only the last level is missing */
ff=sysmkdir(adir)
if ff=0 then return ff

/* make the tree */
f2=adir'\'
dd=filespec('d',f2)
pp=filespec('p',f2)
if pp='\' | pp='' then return -1

/* walk down the path one level at a time, creating each level; a
   level that already exists simply fails and is passed over.  The
   path is split at \ only, so blanks inside a name are kept. */
rest=strip(pp,'b','\')
sofar=dd
hoo=-1
do while rest\==''
   parse var rest piece '\' rest
   if piece=='' then iterate            /* doubled \ */
   sofar=sofar'\'piece
   hoo=sysmkdir(sofar)
   if hoo=0 & verbose=1 then call printsay '     ... creating: 'sofar  
end /* do */

return hoo


/****************/
/* URL and DESTDIR help info */
/* Shows a short description of the program, waits for a key, and
   returns 1. */
helpme1:

say
say bold"GrabSite"normal" is designed to copy a WWW site to your local hard disk. "
say 
say "It's easy to use: just specify a URL, and then specify a directory"
say "on your hard drive to copy the web pages (and other files) retrieved"
say "from this WWW site."
say
say "For example: suppose the 'home page' is"
say "     www.coolstuff.org/games/expert.htm"
say "and the 'destination directory' is:"
say "     d:\localweb\game10 "
say "Then..."
say "  a) GrabSite will GET (using socket calls) the /games/expert.htm HTML "
say "     document at www.coolstuff.org."
say "  b) A copy of /games/expert.htm will be written to d:\localweb\game10 "
say "  c) /games/expert.htm  will be scanned for links "
say "  d) For each link found, repeat step a (changing names appropriately)"
say 
say "Note: For hints on running from command line, run GrabSite with a ? argument."
say"       Example: D:>GrabSite ? "
say 
call charout,reverse"Hit any key to continue "normal
foo=sysgetkey('noecho')
say
return 1

/****************/
/* INCLUDER help info */
help_Includer:
/* Display the help text describing the two "scope" tests (download-and-examine
   versus download-only), then call sysgetkey('noecho') before returning 1.
   Runs in the caller's variable pool (no PROCEDURE): reads bold, normal and
   reverse, and assigns foo.
   NOTE(review): "thar" in item i) below looks like a typo for "that". */
say 
say "You can, and should, limit the scope of "bold"GrabSite"normal"'s WWW downloads"
say "(If you don't, you could end up downloading a significant chunk of the WWW!)"
say
say "There are two tests used to limit scope: "
say
say " a) Limiting what URLS are "bold"downloaded"normal" and "bold"examined"normal"."
say "    URLS that pass this test are retrieved (and saved to disk). "
say "    If they are text/html documents they will also be 'parsed' --"
say "    the links found in these text/html documents may also be retrieved."
say
say " b) Limiting what URLS are "bold"downloaded"normal", but "bold"not"normal" examined."
say "    URLS that pass this less stringent test are downloaded (and saved to disk)."
say "    They are "bold"not"normal" parsed -- links they may contain are ignored."
say
say " By using two tests, one can:"
say "  i) 'Recursively GET'  URLS thar are in (or under) the directory "
say "     of the 'home page' you selected. "
say " ii) Download & save (but not examine) files pointed to by these pages. "
say "     For example, .GIF files stored on a different part of the site."
say
call charout,reverse"Hit any key to continue "normal
foo=sysgetkey('noecho')
say
return 1

/****************/
/* writeall help info */
help_writeall:
/* Display the help text for the "download everything" versus "HTML documents
   only" choice, then call sysgetkey('noecho') before returning 1.
   Runs in the caller's variable pool (no PROCEDURE): reads htmlexts,
   not_htmlexts, cy_ye, reverse and normal, and assigns foo. */
say 
say "You can either: "
say " a) Download all documents, images, etc. from the site (more precisely,"
say "     documents, etc. that satisfy the 'scope tests')"
say " b) Only download HTML documents "
say
say "The latter option is useful if you want a quick snapshot of the navigable"
say "portion of the site -- if you do not care about images, text files, and "
say "other such 'non-html' contents."
say
say "If you select this latter option, the following rule is used: "
/* show whichever rule is in effect: the include list if HTMLEXTS is set,
   otherwise the exclude list */
if htmlexts<>'' then
  say "Only retrieve links ending with: "htmlexts
else
 say "Retrieve links that do NOT end with: "not_htmlexts
say
say cy_ye" Note: Configuration hint:"normal
say "  You can modify this rule by changing the HTMLEXTS and NOT_HTMLEXTS parameters"
say
call charout,reverse"Hit any key to continue "normal
foo=sysgetkey('noecho')
say
return 1
  

/***************/
/* cut length of string to nn characters, if necessary */
cutstrg:procedure
/* Shorten ASTR to at most ILEN characters by replacing its middle with '...'.
     astr : the string to shorten
     ilen : maximum length of the result; if omitted (or not a whole number)
            ASTR is returned unchanged
   A string that already fits (length <= ILEN) is returned unchanged.
   The characters kept are split between head and tail in the ratio 14:33,
   so ILEN=50 yields left(astr,14)'...'right(astr,33).
   Previously the cut always produced 50 characters whatever ILEN was, which
   could make a short string longer (and blank padded) than the original. */
parse arg astr,ilen
if ilen='' then return astr
if datatype(ilen,'W')=0 then return astr

if length(astr)<=ilen then return astr

/* no room for head + '...' + tail: just truncate */
if ilen<=3 then return left(astr,max(ilen,0))

nkeep=ilen-3                /* characters of ASTR that survive */
nhead=(nkeep*14)%47
ntail=nkeep-nhead
return left(astr,nhead)'...'||right(astr,ntail)


/***************/
/* say, and possible lineout, output */
printsay:procedure expose logfile bold normal reverse cy_ye
/* Write a message to the screen and, when a log file is configured
   (logfile is not 0), append the same message to the log with the
   screen-attribute strings (bold, normal, reverse, cy_ye) removed.
   Always returns 0. */
parse arg msg

say msg

/* nothing more to do when logging is switched off */
if logfile=0 then return 0

plain=removestrg(removestrg(removestrg(removestrg(msg,bold),normal),reverse),cy_ye)
call lineout logfile,plain
return 0

/***********************************/
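/* examine one HTML element (the tag name followed by its attributes) and
   extract the URL it refers to: BODY BACKGROUND=, IMG/FRAME/EMBED SRC=,
   A/AREA/LINK HREF=, or APPLET CODE= (combined with CODEBASE= if present).
   Returns the URL as it appears in the element, or '' if there is none */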

afindsrc:procedure
/* Extract the URL referenced by one HTML element.
     anarg : the element's tag name followed by its attributes
             (for example:  A HREF="/games/expert.htm#top" )
   Returns the URL, or '' if the element is not one of
   BODY IMG A FRAME AREA EMBED LINK APPLET, or contains no usable URL.
     BODY              : value of BACKGROUND=
     IMG               : value of SRC=
     A, AREA, LINK     : value of HREF=, without any #fragment;
                         javascript: links yield ''
     FRAME, EMBED      : value of SRC=, without any #fragment
     APPLET            : value of CODE=, prefixed by CODEBASE= if present
   Attributes are recognised as blank-delimited NAME=value words; values may
   be enclosed in double or single quotes. */

parse arg anarg

/* Normalise CR, LF, TAB and NUL to blanks BEFORE picking off the tag name,
   so that a tag name followed directly by a line break is still recognised */
anarg=translate(anarg,' ','0d0a0900'x)
parse var anarg htype .
htype=translate(strip(htype))

chklist='BODY IMG A FRAME AREA EMBED LINK APPLET '
anctype=wordpos(htype,chklist)

if anctype=0 then return ''      /* not a url containing element */

if anctype=8 then do             /* APPLET: needs CODE=, may have CODEBASE= */
   abase=''; aref=''
   do while anarg<>''
      parse var anarg a1 anarg
      ua1=translate(a1)
      select
         when abbrev(ua1,'CODEBASE=') then abase=afindsrc_val(a1)
         when abbrev(ua1,'CODE=') then aref=afindsrc_val(a1)
         otherwise iterate
      end  /* select */
      if aref<>'' & abase<>'' then leave
   end /* do */

   if aref='' then return ''      /* no CODE= found */
   if abase<>'' then aref=strip(abase,'t','/')||'/'||strip(aref,'l','/')
   return aref
end

/* all other elements: one attribute, chosen by element type (same order as chklist) */
attname=word('BACKGROUND SRC HREF SRC HREF SRC HREF',anctype)'='

gotit=0 ; gotval=''
do while anarg<>''
   parse var anarg a1 anarg
   if abbrev(translate(a1),attname)=0 then iterate
   gotval=afindsrc_val(a1)       /* only the first occurrence is used */
   gotit=1
   leave
end /* do */
if gotit=0 then return ''

if anctype<=2 then return gotval             /* BODY, IMG: returned as is */

parse var gotval gotval '#' .                /* toss out internal jumps */
if gotval='' then return ''

/* A, AREA, LINK: don't do "javascript:" entries */
if wordpos(anctype,'3 5 7')>0 then
   if abbrev(translate(gotval),'JAVASCRIPT:') then return ''

return gotval


/* helper for afindsrc: return the value part of a NAME=value word,
   without surrounding double or single quotes */
afindsrc_val:procedure
parse arg aword
parse var aword . '=' aval .
aval=strip(aval)
aval=strip(aval,,'"')
aval=strip(aval,,"'")
return aval


/***********/
/* remove substring */
removestrg:procedure
/* Remove every occurrence of a substring.
     aval : the string to clean
     astr : the substring to remove; if empty, AVAL is returned unchanged
   Returns AVAL with all occurrences of ASTR deleted.  Everything else is
   preserved exactly, including blanks that follow the last occurrence (the
   earlier loop ended on a non-strict "is empty" test, which silently dropped
   a remainder consisting only of blanks). */
parse arg aval,astr

if astr=='' then return aval

aa=''
do forever
   ipos=pos(astr,aval)
   if ipos=0 then leave
   aa=aa||left(aval,ipos-1)
   aval=substr(aval,ipos+length(astr))
end
return aa||aval

/**************/

/******************************/
/* parse a robots.txt file, 
The algorithm:
1 ignore # lines (comments)
2a look for user-agent: grabsite lines
2b if none, look for user-agent:*  lines
3 if 2a or 2b don't match, then no robot disallows exist
4 otherwise, look for disallow lines, starting from the line after
  the user-agent line and continuing until the first empty line (use 0a
  as line delimiter, and throw away the 0d)
5 add the asel from each "disallow: asel" line to exclusion_list

---------------
# sample robots.txt -- will add cgi-* to exclusion_list

user-agent: mozilla
Disallow: /samples
Disallow: /stuff/

#user-agent: checklink
user-agent:gizmo
disallow:fes/

user-agent:*
disallow:cgi-

---------------

*/

add_robot:procedure expose verbose 
/* Extract the disallow list that applies to GrabSite from a robots.txt response.
     abody : the complete response (status line, headers, empty line, contents)
   Returns a blank-delimited list of upper-cased disallowed selectors (each
   with leading slashes removed); '' if the response code does not start
   with 2 or there are no entries; ' ' if no user-agent record applies.
   A "user-agent: grabsite" record is preferred; otherwise the last
   "user-agent: *" record is used.  The record ends at the first empty line.

   Two fixes relative to the earlier version:
    - whole-line comments are skipped (only a line consisting of just '#'
      used to be skipped; any other comment line became an empty line and
      so ended the record prematurely)
    - the agent name matched is GRABSITE, as the comments describe, not
      CHECKLINK */
parse arg abody

parse var abody . icode .
if left(strip(icode),1)<>2 then return ''       /* not 200 code, so no disallows */

cr='0a'x

do forever              /* get rid of response header */
  if abody='' then return ''    /* nothing in body */
  parse var abody al1 (cr) abody
  al1=strip(al1,,'0d'x)
  if al1='' then leave  /* found empty line*/
end

/* collect the lines of the file, minus comments */
nn=0
do forever
  if abody='' then leave
  parse var abody al1 (cr) abody
  al1=strip(al1,,'0d'x)
  if left(strip(al1),1)='#' then iterate   /* whole-line comment: ignore it */
  parse var al1 al1a '#' .                 /* drop a trailing comment */
  nn=nn+1
  lins.nn=al1a
end
if nn=0 then return '' /* no entries, return */

lins.0=nn

/* look for GRABSITE, or *,  user-agent */
iat=0
do mm=1 to lins.0
   al=strip(lins.mm)
   if abbrev(translate(al),'USER-AGENT')=0 then iterate
   parse var al . ':' dagent ; dagent=translate(strip(dagent))
   if abbrev(dagent,'GRABSITE')=1 then do   /* specific record wins: stop looking */
       iat=mm
       leave
   end
   if dagent='*' then do                    /* remember it, but keep looking */
       iat=mm
   end /* do */
end /* do */

exlist2=''
if iat=0 then return ' ' /* no matching user-agent */
do mm=iat+1 to lins.0
  al=translate(strip(lins.mm))
  if al='' then leave   /* blank line signals end of "record" */
  if abbrev(al,'DISALLOW')<>1 then iterate
  parse var al  . ':' dasel ; dasel=strip(dasel)
  exlist2=exlist2||' '||strip(dasel,'l','/')
end /* do */

return exlist2



/*******************/
/* compare arg against "robot" exclist. -- return 1 if a match */
robot_No:procedure expose exclist.
/* Check a selector against the robot exclusion list.
     asel     : the selector to test (compared in upper case, with leading
                slashes removed)
     exclist. : exposed stem; exclist.0 is the number of entries and
                exclist.1 ... exclist.n are the excluded prefixes
   Returns 1 if ASEL starts with any entry, otherwise 0.
   Empty entries are ignored (abbrev() reports a match for an empty prefix,
   which would exclude every URL), and a missing or non-numeric exclist.0
   is treated as an empty list. */
parse upper arg asel
asel=strip(asel,'l','/')

if datatype(exclist.0,'W')=0 then return 0

do mm=1 to exclist.0
   tt=exclist.mm
   if tt=='' then iterate
   if abbrev(asel,tt)=1 then return 1
end /* do */
return 0
