Wget's --accept/--reject handling works by applying the supplied strings to the
filename part of the URL only (the part before the "?"). If, after downloading,
the accept/reject strings match the file name it's stored in, the file will be
deleted. The match is performed on the end of the filename(?); this filename
includes the query part of the URL (the part after the "?"). If
--html-extension is used, the filename to which the page is saved is different
and the match never succeeds.

The man page and --help don't mention any of this.

This patch changes the matching. When deciding whether to download a file to
scan for any URLs it may contain, the matching is also applied to the end of
the (not %-decoded) query part of the URL. This makes it possible to
download/look at

   http://.../dir/somefile

but to never download

   http://.../dir/somefile?morestuff=1
   http://.../dir/somefile?evenmorethingsidontwant=yes

which is essential in some cases.

The problem really is twofold - decide whether to download something for its
URLs, and decide whether to download something ever.

Would it be a good idea to implement a regex accept/include against the whole
URL? %-decoded URL? A switch which says whether to apply exclusions to files of
type text/html as well? I might implement something if it's thought to be
useful.

For the record, this patch is against the source of wget shipped with SuSE 8.2,
which has several patches applied to it. In this case that should be
irrelevant.

Volker Kuhlmann, 6 Jul 2003
http://volker.dnsalias.net/soft/patch/wget-1.8.2-accrejquery.diff



--- wget-1.8.2/src/ftp.c.orig	2003-07-05 17:50:16.000000000 +1200
+++ wget-1.8.2/src/ftp.c	2003-07-05 17:50:16.000000000 +1200
@@ -1618,6 +1618,8 @@
 {
   struct fileinfo *orig, *start;
   uerr_t res;
+  struct fileinfo *f;
+
 
   con->cmd |= LEAVE_PENDING;
 
@@ -1629,8 +1631,7 @@
      opt.accepts and opt.rejects.  */
   if (opt.accepts || opt.rejects)
     {
-      struct fileinfo *f = orig;
-
+	f = orig;
       while (f)
 	{
 	  if (f->type != FT_DIRECTORY && !acceptable (f->name))
@@ -1642,6 +1643,18 @@
 	    f = f->next;
 	}
     }
+  /* Remove all files with possible harmful names */
+  f = orig;
+  while (f)
+  {
+     if (has_invalid_name(f->name))
+     {
+	  logprintf (LOG_VERBOSE, _("Rejecting `%s'.\n"), f->name);
+	  f = delelement (f, &start);
+     }
+     else
+	  f = f->next;
+  }
   /* Now weed out the files that do not match our globbing pattern.
      If we are dealing with a globbing pattern, that is.  */
   if (*u->file && (action == GLOBALL || action == GETONE))
--- wget-1.8.2/src/recur.c.orig	2002-05-28 02:00:06.000000000 +1200
+++ wget-1.8.2/src/recur.c	2003-07-05 21:19:28.000000000 +1200
@@ -427,6 +427,10 @@
   const char *url = u->url;
 
   DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));
+  DEBUGP (("  \\->  URL path '%s'\n", u->path));
+  DEBUGP (("  \\->  URL params '%s'\n", u->params));
+  DEBUGP (("  \\->  URL query '%s'\n", u->query));
+  DEBUGP (("  \\->  URL fragment '%s'\n", u->fragment));
 
   if (string_set_contains (blacklist, url))
     {
@@ -536,17 +540,26 @@
 	   - recursion is not infinite,
 	   - and we are at its very end. */
 
+    suf = suffix (url);
     if (u->file[0] != '\0'
-	&& ((suf = suffix (url)) == NULL
+	&& (   (suf == NULL)
 	    || (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
-	    || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
+	    || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)
+	   )
+	)
       {
 	if (!acceptable (u->file))
 	  {
-	    DEBUGP (("%s (%s) does not match acc/rej rules.\n",
+	    DEBUGP (("%s (%s) doesn't match acc/matches rej rules.\n",
 		     url, u->file));
 	    goto out;
 	  }
+	if ((u->query) && !acceptable (u->query))
+	  {
+	    DEBUGP (("%s (%s) doesn't match acc/matches rej rules.\n",
+		     url, u->query));
+	    goto out;
+	  }
       }
   }
 
--- wget-1.8.2/src/utils.c.orig	2002-05-18 15:05:22.000000000 +1200
+++ wget-1.8.2/src/utils.c	2003-07-05 20:29:36.000000000 +1200
@@ -753,8 +753,16 @@
    If the BACKWARD is 0, don't do backward comparison -- just compare
    them normally.  */
 static int
-in_acclist (const char *const *accepts, const char *s, int backward)
+in_acclist_ (const char *const *accepts, const char *s, int backward)
 {
+  static int done=0;
+  const char *const *acc = accepts;
+  if (!done) {
+    done=1;
+    for (; *acc; acc++)
+      DEBUGP (("*** ACCEPTS/REJECTS: %s\n", *acc));
+  }
+    
   for (; *accepts; accepts++)
     {
       if (has_wildcards_p (*accepts))
@@ -780,6 +788,14 @@
     }
   return 0;
 }
+static int
+in_acclist (const char *const *accepts, const char *s, int backward)
+{
+  int ret = in_acclist_ (accepts, s, backward);
+  DEBUGP (("*** ACCLIST: ret=%d, backwards=%d '%s'\n",
+     ret, backward, s));
+  return ret;
+}
 
 /* Return the location of STR's suffix (file extension).  Examples:
    suffix ("foo.bar")       -> "bar"

