/*
 *  Grabs URL's from the web page on the disk or on the Web
 *  and saves them in the .rc file. External links are
 *  saved as URL objects, local links open a new
 *  web page, URL's in which are saved as separate
 *  URL folder. Can work recursively on a set of pages.
 *  (c) valerius, 2006, Jun 8.
 *  licensed under BSD license ;-)
 */


parse arg args


/* Web page codepage                   */
cp = ''

/*

  opt.  -- Command line options
  opts. -- current WPS object settings

 */

/* Defaults */
opt.url = ''
opt.nestlevel = 1
opt.queflag = 0
opt.infile = ''
opt.urlsonly = 0

objCount = 0

/* System dependent parameters */
parse source opt.OS .

if opt.OS = 'OS/2'    | opt.OS = 'DOS' |,
   opt.OS = 'WINDOWS' | opt.OS = 'NT'  |,
   opt.OS = 'WINNT'
then do
  opt.case1 = 1
  opt.fs_slash = '\'
  opt.NUL   = '\dev\nul'
  opt.suf   = '.exe'
  opt.EOL   = '0D 0A'x
  opt.fixer = '| fixeoln'
  opt.noecho = '@'
end
else do
  opt.case1 = 0
  opt.fs_slash = '/'
  opt.NUL = '/dev/null'
  opt.suf = ''
  opt.EOL = '0A'x
  opt.fixer = ''
  opt.noecho = ''
end

opt.url_slash  = '/'

if opt.case1 then do
  /* Load Utility REXX DLLs              */
  call initRxDlls
  /* Determine operating system codepage */
  hostcp = UniQueryCp()
end

/* Queue name  */
newque   = 'graburls_que'
stackque = 'graburls_stack_que'

/* Expose list */
expList = 'infile opt. opts.',
          'cp hostcp newque que stack',
          'tag tag. text rest',
          'objCount sym.'

/*

  infile -- current processed html file
  tag    -- current processed html tag
  tag.i.name  -- name  of i'th element of tag
  tag.i.value -- value of i'th element of tag

 */

rest = ''

call exposeSymTable

/* Parse script command line  */
call parseCmdLine args

/* Create a private queue for
   reading WGet output        */
if opt.queflag then do
  que = RxQueue('CREATE', newque)
  oldque = RxQueue('SET', que)
end

/* Create a private queue for
   using as folder stack      */
stack = RxQueue('CREATE', stackque)

/*    Output a folder associated
      with this page          */
opts.location = '<WP_DESKTOP>'

oq = RxQueue('SET', stack)
push opts.location
call RxQueue 'SET', oq

if \opt.urlsonly then
  call outFolder , 'Bookmarks'

p = lastpos(opt.fs_slash, opt.infile)
if p > 0
  then opt.dir = delstr(opt.infile, p)
  else opt.dir = '.'

/* Process current HTML page */
call processPage opt.infile

/*   Delete a stack          */
call RxQueue 'DELETE', stack

/*   Delete a private queue  */
if opt.queflag then do
  call RxQueue 'SET', oldque
  call RxQueue 'DELETE', que
end


exit 0
/* ------=================------ */
parseCmdLine: procedure expose opt.
args = arg(1)

do while args \= ''

  opt = getarg()

  select

    when abbrev('-h', opt, 2) then do
      /* Output help screen */
      call GiveHelp
      exit 0
    end

    when abbrev('-http', opt, 5) then do
      /* Download a page via HTTP */
      opt.url = getarg()
      opt.queflag = 1
      iterate
    end

    when abbrev('-r', opt, 2) then do
      /* Recursion level */
      opt.nestlevel = getarg()
      iterate
    end

    when abbrev('-u', opt, 2) then do
      /* Output URL's only */
      opt.urlsonly = 1
      iterate
    end

    when pos('-', opt) \= 1 then do
      /* Input file name */
      opt.infile = opt
      iterate
    end

    otherwise nop

  end

end


return
/* ------=================------ */
getarg: procedure expose args

/* Gets one word, or a line, enclosed
   in quotes, from args               */

args = strip(args)

if pos('"', args) == 1 then
  parse value args with '"' opt '"' args
else
  parse var args opt args


return opt
/* ------=================------ */
processPage: procedure expose (expList)
infile = arg(1)

/* Download a page by WGet */
if opt.queflag then do
  opt.noecho'wget'opt.suf' -O - http://'opt.url' 2>'opt.NUL' 'opt.fixer' | rxqueue 'que
end

q = (infile = '')

if \q then do
  infile = strip(infile, 'B', '"')
  q = ( stream(infile, 'c', 'query exists') \= '' )
end

if q then do
  if infile \= '' then
    ret = stream(infile, 'c', 'open read')

  /*
   * Must be a line like this to properly determine the codepage:
   * <META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=koi8-r">
   */

  do while getTag()
    call processTag
  end

  if infile \= '' then
    ret = stream(infile, 'c', 'close')
end
else do
  call lineout 'stderr', 'Error: file 'infile' does not exist!'
  exit -255
end


return
/* ------=================------ */
processTag: procedure expose (expList)

select

  when tag = '!--' then do
    /* Delete comments */
    call getText '-->'
  end

  when tag = 'SCRIPT' then do
    /* Delete scripts */
    call getText '</SCRIPT'
  end

  when tag = 'META' then do
    /* Determine the codepage */
    if cp = '' then do
      do i = 1 to tag.0
        elem = tag.i.name
        if elem = 'CONTENT' then do
          val = tag.i.value
          val = translate(val)
          parse var val one two
          parse var two 'CHARSET=' cp
          leave
        end
      end
      cp = translate(cp)
      p = pos('CP', cp)
      if  p > 0 then cp = delstr(cp, p, 2)
    end
  end

/*
  when tag = 'OL' then do
    objCount = objCount + 1
    call openFolder
    line = getText('</OL', 'LAST')
    do while getTag(line, 'NOINP')
      call processTag
    end
    call closeFolder
  end

  when tag = 'UL' then do
    objCount = objCount + 1
    call openFolder
    line = getText('</UL', 'LAST')
    do while getTag(line, 'NOINP')
      call processTag
    end
    call closeFolder
  end

  when tag = 'LI' then do
    line = getText('<LI', 'LAST')
    do while getTag(line, 'NOINP')
      call processTag
    end
  end
 */

  when tag = 'A' then do
    /* Get text from current tag to the next '</A' */
    text = getText('</A')
    /* Strip all tags from the text */
    text = StripTags(text)
    /* Translate text into host codepage */
    if cp \= '' & opt.case1 then
      text = UniXlat(text, hostcp, cp)
    do i = 1 to tag.0
      elem = tag.i.name
      if elem = 'HREF' then do
        val = tag.i.value
        val = strip(val, 'B', '"')
        call OutLine val, text
      end
    end
  end

  otherwise nop

end


return
/* ------=================------ */
getTag: procedure expose (expList)
line = arg(1)

/*
 *  Gets a next html tag from html file
 *  and gets the next text block between
 *  them, if any.
 *  text before tag --> text
 *  tag --> tag
 *  the rest of the line after a tag --> rest
 *  tag attributes --> tag.
 *  i.e.: name of attr  --> tag.i.name
 *        value of attr --> tag.i.value
 */

line = rest line
rest = ''

do while thereAreLines() | line \= ''
  p = pos('<', line)
  if p > 0 then do
    do while thereAreLines() | line \= ''
      q = pos('>', line)
      if q > 0 then do
        /* Delete tabs */
        line = despace(translate(line, ' ', '   '))
        if pos('<!--', line) = p then do
          parse var line . '<!--' line
          tag = '!--'
          elems = ''
        end
        else do
          r = pos('<', line, p + 1)
          if r > 0 & r < q then do
            /* Skip erroneous tags (non-closed tags) */
            parse var line . '<' . line
            tag = ''
            elems = ''
          end
          else do
            parse var line text '<' u '>' line
            parse var u tag elems
            tag = translate(tag)
            elems = strip(elems)
          end
        end
        i = 0
        /* Parse tag elements */
        do while elems \= ''
          if pos('=', elems) <= 0
            then leave
          i = i + 1
          parse var elems elem '=' '"' val '"' next
          if val = '' then
            parse var elems elem '=' '''' val '''' next
          else
          if val = '' then
            parse var elems elem '=' val next
          elems = strip(next)
          val = strip(val, 'B', '"')
          tag.i.name  = translate(elem)
          tag.i.value = val
        end
        tag.0 = i
        rest = line
        rest = strip(rest)
        line = ''
        return 1
      end
      line = getNonEmptyLine(line)
      line = strip(line)
      rst = ''
    end
  end
  if thereAreLines() then do
    line = getNonEmptyLine(line)
    line = strip(line)
  end
  else
    return 0
end

line = ''


return 0
/* ------=================------ */
getNonEmptyLine: procedure expose (expList)
line = arg(1)

do until \thereAreLines()
  l = getLine()
  l = strip(l)
  if l \= '' then do
    line = line l
    line = strip(line)
    leave
  end
end


return line
/* ------=================------ */
getText: procedure expose (expList)
nexttag = arg(1)

/*
 *   nexttag is a list of tags, delimited by
 *   a semicolon, this function returns a
 *   string from current position in text to
 *   the first occurence of any of tags in
 *   nexttag list.
 *   nexttag = '<UL>;<OL>;<DL>'
 */

rest = strip(rest)

i = 0
do while nexttag \= ''
  i = i + 1
  parse var nexttag nexttag.i ';' nexttag
end
nexttag.0 = i

p = 0
l = 0
do while thereAreLines() | line \= ''
  line = getLine()
  line = rest line
  line = strip(line)
  rest = ''
  do while thereAreLines() | line \= ''
    do i = 1 to nexttag.0
      q = p
      p = pos(translate(nexttag.i), translate(line))
      if q > 0 & p > 0 & p > q then do
        p = q
        l = length(nexttag.i)
      end
    end
    /* Now p is a position of first occurence
        of any of tags in nexttag list        */
    if p > 0 then do
      rest = substr(line, p + l)
      line = delstr(line, p)
      /* Delete end of line symbols */
      line = stripEOL(line)
      return line
    end
    else do
      rest = line
      line = getLine()
      line = rest line
      rest = ''
    end
  end
end


return ""
/* ------=================------ */
thereAreLines: procedure expose (expList)

if opt.queflag then do
  if queued() > 0 then
    return 1
  else
    return 0
end
else do
  return lines(infile)
end


return 0
/* ------=================------ */
getLine: procedure expose (expList)

if opt.queflag then
  parse pull line
else
  line = linein(infile)


return line
/* ------=================------ */
stripTags: procedure
line = arg(1)

/*   Strip all tags from the line          */
/*   If after deleting tags remains an     */
/*   empty line then we change it          */
/*   into alt string of the <img src=...>, */
/*   if any.                               */
/*   <img src=... alt=...> --> alt=...     */

alt = ''
q = pos('<IMG', translate(line))
if q > 0 then do
  r = pos('>', line, q)
  if r > 0 then do
    l = substr(line, q + 5)
    l = delstr(l, r - q - 4)
    m = pos('ALT=', translate(l))
    if m > 0 then do
      l = substr(l, m)
      l = delstr(l, 1, 4)
      l = strip(l)
      if pos('"', l) > 0
        then
          parse var l '"' alt '"' .
        else
          if pos('''', l) > 0
            then
              parse var l '''' alt '''' .
            else
              parse var l alt .
    end
  end
end

do forever
  p = pos('<', line)
  if p > 0 then do
    parse var line head '<' . '>' tail
    line = head tail
  end
  else
    leave
end

line = despace(line)
if line = ''
  then
    line = alt

return line
/* ------=================------ */
stripUrl: procedure
line = arg(1)

if pos('://', line) > 0 then do
  parse var line head proto '://' domain '/' tail

  if proto = '' then do
    proto = head
    head = ''
  end

  p = lastpos('/', tail)
  tail = substr(tail, p + 1)

  line = head domain tail
  line = despace(line)
end

line = strip(line)


return line
/* ------=================------ */
deleteQuotes: procedure
line = arg(1)

do forever
  p = pos('"', line)
  if p > 0 then
    line = delstr(line, p, 1)
  else
    leave
end


return line
/* ------=================------ */
stripEOL: procedure expose opt.
line = arg(1)

do forever
  p = pos(opt.EOL, line)
  if p = 0
    then leave
  parse var line first (opt.EOL) last
  line = first || last
end


return line
/* ------=================------ */
outLine: procedure expose (expList)
url = arg(1)
text = arg(2)

text = despace(text)
text = strip(text)

p = pos('://', url)
parse var url proto '://' .
proto = translate(proto)

u = translate(url)

if pos('MAILTO:', u) = 1
then
  return

if opt.urlsonly then do
  say url
  return
end

if p > 0 &,
 ((proto = 'HTTP') | (proto = 'HTTPS')  | (proto = 'FTP') |,
  (proto = 'IRC')  | (proto = 'GOPHER') |,
  (proto = 'WAIS') | (proto = 'NEWS'))
then do
  /* External link */
  call outUrl url, text
end
else do
  if text = 'HotLog' then
     pull .
  if pos('#', url) \= 1 then do
    /* A link local to current site, not to a page */
    if text = '' then text = url
    call outFolder url, text
  end
end


return
/* ------=================------ */
outUrl: procedure expose (expList)
url  = arg(1)
text = arg(2)

objCount = objCount + 1

text = makeObjectName(text)

opts.name     = text
opts.class    = 'WPUrl'
opts.opt      = 'UPDATE'

opts.objId = makeObjectId(text)

opts.setup = 'TITLE='text';URL='url ||,
             ';OBJECTID='opts.objId';'

call rcOut


return
/* ------=================------ */
outFolder: procedure expose (expList)
url  = arg(1)
text = arg(2)

objCount = objCount + 1

text = makeObjectName(text)

opts.name  = text
opts.class = 'WPUrlFolder'
opts.opt   = 'UPDATE'

opts.objId = makeObjectId(text)

opts.setup = 'TITLE='text';SHOWALLINTREEVIEW=YES' ||,
             ';OBJECTID='opts.objId';'

call rcOut

call openFolder

if url = '' then return

/* Whether to do the next level of recursion */
opt.nestlevel = opt.nestlevel - 1
if opt.nestlevel \= 0 then do
  if opt.queflag then do
    /* A page on the web */
    v = opt.url
    u = url
    p = lastpos(opt.url_slash, v)
    if p > 0 then v = delstr(v, p + 1)
    drop url
    opt.url = v || u
    call processPage ''
  end
  else do
    /* A page on a disk */
    u = translate(url, opt.fs_slash, opt.url_slash)
    u = opt.dir || opt.fs_slash || u
    p = lastpos(opt.fs_slash, u)
    if p > 0 then opt.dir = delstr(u, p)
    u = stream(u, 'c', 'query exists')
    if  u \= '' then
      call processPage u
  end
end
opt.nestlevel = opt.nestlevel + 1

call closeFolder


return
/* ------=================------ */
openFolder: procedure expose (expList)

opts.objId = MakeObjectId()

oq = RxQueue('SET', stack)
push opts.location
call RxQueue 'SET', oq
opts.location = opts.objId


return
/* ------=================------ */
closeFolder: procedure expose (expList)

oq = RxQueue('SET', stack)
parse pull opts.location
call RxQueue 'SET', oq


return
/* ------=================------ */
makeObjectName: procedure expose (expList)
text = arg(1)


do forever
  parse var text first '&' symbol ';' last
  if symbol = '' then
    leave
  do i = 1 to sym.0
    if symbol = sym.i then do
      symbol = sym.i.val
      leave
    end
  end
  text = first || symbol || last
end

text = translate(text, '|`', ';"')
text = deleteQuotes(text)
text = stripUrl(text)
text = stripEOL(text)
text = strip(text)


return text
/* ------=================------ */
makeObjectId: procedure expose (expList)
text = arg(1)

/* Prefixes for object ids */

/* For Url */
prefix.url     = '<MY_URL_'
/* For Url Folder */
prefix.urlf    = '<MY_URLF_'
/* Default prefix */
prefix.other   = '<MY_'

select
  when opts.class = 'WPUrl' then
    prefix = prefix.url

  when opts.class = 'WPUrlFolder' then
    prefix = prefix.urlf

  otherwise
    prefix = prefix.other
end

id = prefix || objCount || '>'


return id
/* ------=================------ */
rcOut: procedure expose opts.

app = '"PM_InstallObject"'
key = '"' || opts.name     || ';' || opts.class ||,
      ';' || opts.location || ';' || opts.opt   || '"'

opts.setup = strip(opts.setup)
opts.setup = strip(opts.setup, 'L', ';')

val = '"'opts.setup'"'

line = '  'app'    'key'   'val

say line


return
/* ------=================------ */
despace: procedure
line = arg(1)

/* Remove double spaces
   from the line        */

do forever
  p = pos('  ', line)
  if p > 0 then line = delstr(line, p, 1)
  else
    leave
end


return line
/* ------=================------ */
initRxDlls:

call RxFuncAdd 'SysLoadFuncs', 'rexxutil', 'SysLoadFuncs'
call SysLoadFuncs

call RxFuncAdd 'UniLoadFuncs', 'rexxuni', 'UniLoadFuncs'
call UniLoadFuncs


return
/* ------=================------ */
exposeSymTable: procedure expose sym.

sym.0 = 8
sym.1 = 'nbsp';      sym.1.val = ' '
sym.2 = 'laquo';     sym.2.val = '`'
sym.3 = 'raquo';     sym.3.val = ''''
sym.4 = 'amp';       sym.4.val = '&'
sym.5 = 'quot';      sym.5.val = '"'
sym.6 = 'lt';        sym.6.val = '<'
sym.7 = 'gt';        sym.7.val = '>'
sym.8 = 'hellip';    sym.8.val = ''


return
/* ------=================------ */
giveHelp:

call lineout 'stderr',,
   'Url grabber' || opt.EOL                                                    ||,
   '(c) 2006 valerius' || opt.EOL                                              ||,
   'Syntax:' || opt.EOL                                                        ||,
   '' || opt.EOL                                                               ||,
   'graburls [-h | [-r <nestlevel>] [-http <url> | <file.html>] ] ' || opt.EOL ||,
   'where:' || opt.EOL                                                         ||,
   '-h             -- output this help screen' || opt.EOL                      ||,
   '-r <nestlevel> -- recursion level' || opt.EOL                              ||,
   '-u             -- output urls only, not .rc file lines'                    ||,
   '-http <url.html> or' || opt.EOL                                            ||,
   '<file.html>    -- whether to parse a page on a web or on disk' || opt.EOL  ||,
   '' || opt.EOL


return
/* ------=================------ */
