2009-01-26 20:12:53 +00:00
|
|
|
// $Id: fetch.c,v 1.42 2008/12/10 22:53:33 lynx Exp $ // vim:syntax=lpc
|
2009-01-26 19:21:29 +00:00
|
|
|
//
|
|
|
|
// generic HTTP GET client, mostly used for RSS -
|
|
|
|
// but we could fetch any page or data with it, really
|
2009-04-18 06:09:05 +00:00
|
|
|
// tobij even allowed the object to have the URL as its object name. fancy! ;)
|
|
|
|
|
2009-04-20 06:18:13 +00:00
|
|
|
#ifdef Dfetch
|
|
|
|
# undef DEBUG
|
|
|
|
# define DEBUG Dfetch
|
|
|
|
#endif
|
|
|
|
|
2009-01-26 19:21:29 +00:00
|
|
|
#include <ht/http.h>
|
2009-01-26 20:12:53 +00:00
|
|
|
#include <net.h>
|
2009-03-03 23:40:26 +00:00
|
|
|
#include <uniform.h>
|
2009-01-26 19:21:29 +00:00
|
|
|
#include <services.h>
|
|
|
|
|
|
|
|
virtual inherit NET_PATH "output"; // virtual: in case we get inherited..
|
|
|
|
inherit NET_PATH "connect";
|
|
|
|
//inherit NET_PATH "place/master";
|
|
|
|
|
|
|
|
#ifdef NEW_QUEUE
|
|
|
|
inherit NET_PATH "queue2";
|
|
|
|
#else
|
|
|
|
inherit NET_PATH "queue";
|
|
|
|
#endif
|
|
|
|
|
|
|
|
// --- what to fetch and where to connect ------------------------------
// url:      the URL as handed to fetch()
// thehost:  host part used for the outgoing connection (see connect())
// host:     host part sent in the "Host:" header (see real_logon())
// resource: path part requested with GET (see real_logon())
volatile string url;
volatile string thehost;
volatile string host;
volatile string resource;
// port: TCP port to connect to; ssl: set by connect() for https URLs;
// fetching: non-zero from connect() until disconnected() has run
volatile int port;
volatile int ssl;
volatile int fetching;

// --- request decoration ----------------------------------------------
// useragent is sent as "User-Agent:" unless it is zero;
// basicauth is either "" or a complete "Authorization:" header line.
volatile string useragent = SERVER_VERSION;
volatile string basicauth = "";

// --- response state --------------------------------------------------
// headers collects the reply headers while they are being parsed;
// fheaders is the finished set handed out after the disconnect.
volatile mapping headers;
volatile mapping fheaders;
// http_status / http_message come from the status line.
volatile int http_status;
volatile string http_message;
// modificationtime and etag remember the "last-modified" and "etag"
// reply headers of the last finished fetch.
volatile string modificationtime;
volatile string etag;
// buffer is scratch space: first the extra request headers, then the
// accumulating reply body. fetched is the last completed body.
volatile string buffer;
volatile string fetched;

// input handlers, chained via next_input_to() in this order
int parse_status(string all);
int parse_header(string all);
int buffer_content(string all);
|
|
|
|
|
|
|
|
// Query the host this fetcher connects to (zero until connect() has
// parsed it from the URL or it was set otherwise).
string qHost() {
    return thehost;
}
|
|
|
|
|
|
|
|
// Start fetching the given URL unless a fetch is already in progress.
//
// murl: the URL to retrieve. If it differs from the one currently
//       stored, it replaces it and the cached resource path is dropped
//       so that real_logon() parses it again.
void fetch(string murl) {
    unless (url == murl) {
        // accept.c already repairs ":/" into "://" for us, so the URL
        // is taken as is; that keeps this method usable in a normal
        // way too.
        //url = replace(murl, ":/", "://");
        url = murl;
        // the resource may carry other parameters now: re-parse later
        resource = 0;
        // NOTE(review): host and port are deliberately NOT reset here
        // (the original has this commented out), so a URL on another
        // host would still connect to the old one - confirm intent.
        //thehost = port = 0;
    }
    P3(("%O: fetch(%O)\n", ME, url))
    if (fetching) return;
    connect();
}
|
|
|
|
|
|
|
|
// Hand back this very object.
object load() {
    return ME;
}
|
|
|
|
|
2009-04-18 06:09:05 +00:00
|
|
|
// Set HTTP basic authentication credentials.
//
// Stores a ready-made "Authorization: Basic ..." header line which
// real_logon() prepends to the other request headers.
void sAuth(string user, string password) {
    string credentials;

    credentials = encode_base64(user +":"+ password);
    basicauth = "Authorization: Basic "+ credentials +"\r\n";
}
|
|
|
|
|
2009-04-18 06:09:05 +00:00
|
|
|
// Set the User-Agent string sent with requests and return it.
// Passing zero makes real_logon() omit the header altogether.
string sAgent(string a) {
    useragent = a;
    return useragent;
}
|
2009-01-26 19:21:29 +00:00
|
|
|
|
|
|
|
// net/place/news code follows.
|
|
|
|
|
|
|
|
// Open the connection for the current url.
//
// Marks the object as fetching, works out whether TLS is needed, fills
// in thehost and port if they are not known yet, and hands over to the
// inherited connect(host, port).
//
// Changes compared to the previous version:
//  - the scheme is inspected on every call. Before, ssl was reset to 0
//    each time but only set again while thehost was still unknown, so
//    a second fetch of an https URL went out without TLS.
//  - when the URL cannot be parsed and no host is known, fetching is
//    cleared again; otherwise later fetch()/refetch() calls would never
//    reconnect.
//  - a partial sscanf match no longer leaves thehost at zero before
//    lower_case() is applied.
//  - no value is returned from this void function any more.
void connect() {
    string scheme, hostpart;

    fetching = 1;
    ssl = 0;

    // hostpart stays zero unless both the scheme suffix and the host
    // were matched
    if (stringp(url)) sscanf(url, "http%s://%s/%!s", scheme, hostpart);

    if (stringp(hostpart)) {
        ssl = scheme == "s";
        // an already known host is kept as is
        unless (thehost) thehost = lower_case(hostpart);
    } else if (!thehost) {
        P0(("%O couldn't parse %O\n", ME, url))
        fetching = 0;
        return;
    }
    P4(("URL, THEHOST: %O, %O\n", url, thehost))

    // an explicit "host:port" wins, otherwise pick the scheme default
    unless (port)
        unless (sscanf(thehost, "%s:%d", thehost, port) == 2)
            port = ssl? HTTPS_SERVICE: HTTP_SERVICE;

    P2(("Resolving %O and connecting.\n", thehost))
    ::connect(thehost, port);
}
|
|
|
|
|
2009-04-20 06:18:13 +00:00
|
|
|
// Send the HTTP request once the connection is usable.
//
// Called directly from logon(), and also passed to
// tls_init_connection() as a closure when the URL needs TLS.
//
// failure: passed through to the inherited logon().
// Returns -1 if the inherited logon() returned zero, -3 if no url is
// set, and 0 after the request has been emitted.
varargs int real_logon(int failure) {
    string scheme, request;

    // pessimistic defaults, replaced once parse_status() has run
    headers = ([ ]);
    http_status = 500;
    http_message = "(failure)"; // used by debug only

    if (!::logon(failure)) return -1;
    if (!url) return -3;
    // host and resource are cached; fetch() zeroes resource whenever
    // the url changes
    if (!resource) sscanf(url, "%s://%s/%s", scheme, host, resource);

    // collect the optional request headers in buffer
    buffer = basicauth;
    if (modificationtime)
        buffer += "If-Modified-Since: "+ modificationtime + "\r\n";
    if (useragent)
        buffer += "User-Agent: "+ useragent +"\r\n";
    // An "If-None-Match: <etag>" header is not sent (disabled), and
    // with HTTP/1.0 we won't need "Connection: close" either.

    P2(("%O fetching /%s from %O\n", ME, resource, host))
    P4(("%O using %O\n", ME, buffer))

    request = "GET /"+ resource +" HTTP/1.0\r\n";
    request += "Host: "+ host +"\r\n";
    request += buffer;
    request += "\r\n";
    emit(request);

    // from here on buffer accumulates the reply body
    buffer = "";
    next_input_to(#'parse_status);
    return 0; // duh.
}
|
|
|
|
|
2009-04-20 06:18:13 +00:00
|
|
|
// Connection callback: decide how to proceed once the socket is there.
//
// failure: passed on to the inherited logon() / to real_logon().
// sub:     when set, only the inherited logon() runs and its result is
//          returned (used when called from xmlrpc.c, where TLS is not
//          possible anyway).
// Returns 0 in all other cases.
varargs int logon(int failure, int sub) {
    // net/connect disables telnet for all robots and circuits
#if 0 //__EFUN_DEFINED__(enable_telnet)
    // when fetching the spiegel rss feed, telnet_neg() occasionally
    // crashes. fixing that would be cool, but why have the telnet
    // machine enabled at all?
    enable_telnet(0);
#endif
    if (sub) return ::logon(failure);

    if (!ssl) {
        // plain HTTP: send the request right away
        real_logon(failure);
        return 0; // duh.
    }
    // HTTPS: set up TLS first, real_logon is handed over as callback
    tls_init_connection(ME, #'real_logon);
    return 0; // duh.
}
|
|
|
|
|
|
|
|
// Input handler for the first line of the reply, the HTTP status line.
//
// all: the received line, expected as "<protocol> <code> <message>".
//
// Stores code and message in http_status / http_message. Anything but
// R_OK is logged and sent to monitor_report(). Header parsing continues
// with parse_header() in any case. Always returns 1.
//
// A line without whitespace used to leave 'state' at zero and pass that
// to the second sscanf(). Now such a line is simply not parsed, so
// http_status and http_message keep their previous values (real_logon()
// presets 500 and "(failure)") and the line is reported below.
int parse_status(string all) {
    string prot, state;

    if (stringp(all)) sscanf(all, "%s%t%s", prot, state);
    if (stringp(state))
        sscanf(state, "%d%t%s", http_status, http_message);

    if (http_status != R_OK) {
        P0(("%O got %O %O from %O\n", ME,
            http_status, http_message, host));
        monitor_report("_failure_unsupported_code_HTTP",
            S("http/fetch'ing %O returned %O %O", url || ME,
              http_status, http_message));
    }
    next_input_to(#'parse_header);
    return 1;
}
|
|
|
|
|
|
|
|
// Input handler for one line of the reply header section.
//
// all: the received line. An empty line ends the headers and switches
//      input to buffer_content(); any other line is split at the first
//      colon and stored in the headers mapping under its lower-cased
//      name. Lines that do not match "name: value" are ignored.
// Always returns 1.
int parse_header(string all) {
    string key, val;

    // TODO: parse status code
    if (all == "") {
        // we would only want the body for status 200 (not checked here)
        P2(("%O now waiting for http body\n", ME))
        next_input_to(#'buffer_content);
        return 1;
    }

    P2(("http/fetch::parse_header %O\n", all))
    if (sscanf(all, "%s:%1.0t%s", key, val) == 2) {
        headers[lower_case(key)] = val;
        // P2(("ht head: %O = %O\n", key, val))
    }
    next_input_to(#'parse_header);
    return 1;
}
|
|
|
|
|
|
|
|
// Input handler for the reply body: append the received line plus a
// newline to buffer and keep listening. Always returns 1.
int buffer_content(string all) {
    P2(("%O body %O\n", ME, all))
    buffer = buffer + all + "\n";
    next_input_to(#'buffer_content);
    return 1;
}
|
|
|
|
|
|
|
|
// Called when the connection is gone; this is where a fetch completes.
//
// remainder: input left over that was not delivered as a line; when
//            set it is appended to the body.
//
// Publishes the collected body and headers as fetched / fheaders,
// remembers "last-modified" and "etag" for later, and then either
// serves all queued callbacks (status R_OK) or throws the queue away
// and re-initialises it (any other status, R_NOTMODIFIED included).
// Clears the fetching flag and returns 1.
//
// NOTE(review): headers is indexed unconditionally, so this presumes
// real_logon() has run on this connection - confirm for early
// disconnects. Also note that fetched/fheaders are replaced for
// non-R_OK replies as well.
disconnected(remainder) {
    mixed *waiter;

    P2(("%O got disconnected.. %O\n", ME, remainder))

    headers["_fetchtime"] = isotime(ctime(time()), 1);
    if (headers["last-modified"]) {
        modificationtime = headers["last-modified"];
    }
    if (headers["etag"]) {
        etag = headers["etag"]; // heise does not work with etag
    }

    // hand the results over and reset the working state
    fetched = buffer;
    if (remainder) fetched += remainder;
    fheaders = headers;
    headers = 0;
    buffer = 0;

    if (http_status == R_OK) {
        while (qSize(ME)) {
            waiter = shift(ME);
            P2(("%O calls back.. body is %O\n", ME, fetched))
            // waiter is ({ callback, willbehave }): only callers that
            // promised to behave get the shared headers mapping
            funcall(waiter[0], fetched,
                waiter[1] ? fheaders : copy(fheaders));
        }
    } else {
        // R_NOTMODIFIED and everything else end up here.
        // doesn't seem to get here when HTTP returns 301 or 302. strange.
        qDel(ME);
        qInit(ME, 150, 5);
    }

    fetching = 0;
    return 1; // presume this disc was expected
}
|
|
|
|
|
|
|
|
// Return the last fetched body, optionally arranging a callback.
//
// cb:         closure called as cb(body, headers). If nothing has been
//             fetched yet it is queued for the next completed fetch;
//             if content exists it is only called when force is set.
// force:      call cb immediately with the existing content.
// willbehave: when set, cb receives the shared headers mapping rather
//             than a copy.
// Returns fetched, which is zero before the first completed fetch.
varargs string content(closure cb, int force, int willbehave) {
    unless (cb) return fetched;

    unless (fetched) {
        enqueue(ME, ({ cb, willbehave }));
        return fetched;
    }
    if (force)
        funcall(cb, fetched, willbehave ? fheaders : copy(fheaders));
    return fetched;
}
|
|
|
|
|
|
|
|
// Return the headers of the last completed fetch.
//
// willbehave: when set the shared mapping itself is returned,
//             otherwise the caller gets a copy.
varargs mapping headers(int willbehave) {
    if (willbehave) return fheaders;
    return copy(fheaders);
}
|
|
|
|
|
|
|
|
// Look up a single header of the last completed fetch.
// parse_header() stores the names lower-cased. Returns zero when no
// headers are available.
string qHeader(mixed key) {
    unless (mappingp(fheaders)) return 0;
    return fheaders[key];
}
|
|
|
|
|
|
|
|
// Queue a callback and fetch the current url again, unless a fetch is
// already running (the callback then simply waits for that one).
//
// cb:         closure to queue, see content().
// willbehave: queued along with cb, see content().
varargs void refetch(closure cb, int willbehave) {
    mixed *entry;

    entry = ({ cb, willbehave });
    enqueue(ME, entry);
    if (fetching) return;
    connect();
}
|
|
|
|
|
|
|
|
// Object initialisation: set up the queue machinery and this object's
// callback queue, with the same parameters disconnected() uses when it
// re-initialises the queue.
protected create() {
    qCreate();
    qInit(ME, 150, 5);
}
|