smarter proxy specs; multiple proxy servers

This commit is contained in:
David Rose 2003-04-26 01:38:54 +00:00
parent d5b915eaad
commit ba755587b7
6 changed files with 530 additions and 48 deletions

View File

@ -41,6 +41,7 @@ HTTPChannel::
HTTPChannel(HTTPClient *client) :
_client(client)
{
_proxy_next_index = 0;
_persistent_connection = false;
_connect_timeout = connect_timeout;
_http_timeout = http_timeout;
@ -62,7 +63,6 @@ HTTPChannel(HTTPClient *client) :
_status_code = 0;
_status_string = string();
_response_type = RT_none;
_proxy = _client->get_proxy();
_http_version = _client->get_http_version();
_http_version_string = _client->get_http_version_string();
_state = S_new;
@ -254,7 +254,10 @@ run() {
bool repeat_later;
do {
if (_bio.is_null()) {
// If we're in a state that expects to have a connection already
// (that is, any state other than S_try_next_proxy), then
// reestablish the connection if it has been dropped.
if (_bio.is_null() && _state != S_try_next_proxy) {
if (_connect_count > http_max_connect_count) {
// Too many connection attempts, just give up. We should
// never trigger this failsafe, since the code in each
@ -267,8 +270,6 @@ run() {
}
// No connection. Attempt to establish one.
_proxy = _client->get_proxy();
if (_proxy.empty()) {
_bio = new BioPtr(_request.get_url());
} else {
@ -297,6 +298,10 @@ run() {
}
switch (_state) {
case S_try_next_proxy:
repeat_later = run_try_next_proxy();
break;
case S_connecting:
repeat_later = run_connecting();
break;
@ -626,6 +631,32 @@ reached_done_state() {
}
}
////////////////////////////////////////////////////////////////////
//     Function: HTTPChannel::run_try_next_proxy
//       Access: Private
//  Description: Handles the S_try_next_proxy state, which is entered
//               after a connection attempt has failed.  When there
//               is still an untried entry in _proxies, that entry
//               becomes the current proxy, the proxy authorization
//               is discarded, the connection is closed, and the
//               state moves to S_connecting.  When the list is
//               exhausted (or was empty to begin with), the state
//               becomes S_failure.  Always returns false.
////////////////////////////////////////////////////////////////////
bool HTTPChannel::
run_try_next_proxy() {
  if (_proxy_next_index >= _proxies.size()) {
    // Nothing left to try, or we were never using a proxy at all.
    _state = S_failure;
    return false;
  }

  // Advance to the next candidate proxy.  Any authorization we held
  // belonged to the previous proxy, so drop it.
  _proxy = _proxies[_proxy_next_index];
  ++_proxy_next_index;
  _proxy_auth = (HTTPAuthorization *)NULL;

  close_connection();
  _state = S_connecting;
  return false;
}
////////////////////////////////////////////////////////////////////
// Function: HTTPChannel::run_connecting
// Access: Private
@ -647,7 +678,7 @@ run_connecting() {
#ifdef REPORT_OPENSSL_ERRORS
ERR_print_errors_fp(stderr);
#endif
_state = S_failure;
_state = S_try_next_proxy;
return false;
}
@ -680,7 +711,7 @@ run_connecting_wait() {
if (fd < 0) {
downloader_cat.warning()
<< "nonblocking socket BIO has no file descriptor.\n";
_state = S_failure;
_state = S_try_next_proxy;
return false;
}
@ -706,7 +737,7 @@ run_connecting_wait() {
if (errcode < 0) {
downloader_cat.warning()
<< "Error in select.\n";
_state = S_failure;
_state = S_try_next_proxy;
return false;
}
@ -719,7 +750,7 @@ run_connecting_wait() {
downloader_cat.info()
<< "Timeout connecting to "
<< _request.get_url().get_server_and_port() << ".\n";
_state = S_failure;
_state = S_try_next_proxy;
return false;
}
return true;
@ -778,7 +809,7 @@ run_proxy_request_sent() {
// Huh, the proxy hung up on us as soon as we tried to connect.
if (_response_type == RT_hangup) {
// This was our second immediate hangup in a row. Give up.
_state = S_failure;
_state = S_try_next_proxy;
} else {
// Try again, once.
@ -851,7 +882,7 @@ run_proxy_reading_header() {
_status_code += 1000;
}
_state = S_failure;
_state = S_try_next_proxy;
return false;
}
@ -1038,7 +1069,7 @@ run_request_sent() {
// Huh, the server hung up on us as soon as we tried to connect.
if (_response_type == RT_hangup) {
// This was our second immediate hangup in a row. Give up.
_state = S_failure;
_state = S_try_next_proxy;
} else {
// Try again, once.
@ -1050,7 +1081,7 @@ run_request_sent() {
// Time to give up.
downloader_cat.info()
<< "Timeout waiting for " << _request.get_url().get_server_and_port() << ".\n";
_state = S_failure;
_state = S_try_next_proxy;
}
return true;
@ -1085,7 +1116,7 @@ run_reading_header() {
<< "Connection lost while reading HTTP response.\n";
if (_response_type == RT_http_hangup) {
// This was our second hangup in a row. Give up.
_state = S_failure;
_state = S_try_next_proxy;
} else {
// Try again, once.
@ -1097,7 +1128,7 @@ run_reading_header() {
// Time to give up.
downloader_cat.info()
<< "Timeout waiting for " << _request.get_url().get_server_and_port() << ".\n";
_state = S_failure;
_state = S_try_next_proxy;
}
return true;
}
@ -1250,6 +1281,19 @@ run_reading_header() {
}
}
if (_state == S_read_header &&
((get_status_code() / 100) == 5 || get_status_code() == 407) &&
!_proxy.empty() && !_proxy_tunnel && _proxy_next_index < _proxies.size()) {
// If we were using a proxy (but not tunneling through the proxy)
// and we got some kind of a server error, try the next proxy in
// sequence (if we have one). This handles the case of a working
// proxy that cannot see the host (and so returns 504 or something
// along those lines).
_state = S_try_next_proxy;
return false;
}
// Otherwise, we're good to go.
return false;
}
@ -1534,13 +1578,37 @@ begin_request(HTTPEnum::Method method, const DocumentSpec &url,
size_t first_byte, size_t last_byte) {
reset_for_new_request();
// Changing the proxy, or the nonblocking state, is grounds for
// dropping the old connection, if any.
if (_proxy != _client->get_proxy()) {
_proxy = _client->get_proxy();
// Get the set of proxies that are appropriate for this URL.
_proxies.clear();
_proxy_next_index = 0;
_client->get_proxies_for_url(url.get_url(), _proxies);
// If we still have a live connection to a proxy that is on the
// list, that proxy should be moved immediately to the front of the
// list (to minimize restarting connections unnecessarily).
if (!_bio.is_null() && !_proxies.empty() && !_proxy.empty()) {
Proxies::iterator pi = find(_proxies.begin(), _proxies.end(), _proxy);
if (pi != _proxies.end()) {
_proxies.erase(pi);
_proxies.insert(_proxies.begin(), _proxy);
}
}
URLSpec new_proxy;
if (_proxy_next_index < _proxies.size()) {
new_proxy = _proxies[_proxy_next_index];
_proxy_next_index++;
}
// Changing the proxy is grounds for dropping the old connection, if
// any.
if (_proxy != new_proxy) {
_proxy = new_proxy;
_proxy_auth = (HTTPAuthorization *)NULL;
reset_to_new();
}
// Ditto with changing the nonblocking state.
if (_nonblocking != nonblocking) {
_nonblocking = nonblocking;
reset_to_new();
@ -1818,7 +1886,7 @@ parse_http_response(const string &line) {
_status_string = "Not an HTTP response";
if (_response_type == RT_non_http) {
// This was our second non-HTTP response in a row. Give up.
_state = S_failure;
_state = S_try_next_proxy;
} else {
// Maybe we were just in some bad state. Drop the connection

View File

@ -150,6 +150,7 @@ public:
private:
bool reached_done_state();
bool run_try_next_proxy();
bool run_connecting();
bool run_connecting_wait();
bool run_proxy_ready();
@ -204,7 +205,11 @@ private:
void reset_to_new();
void close_connection();
typedef pvector<URLSpec> Proxies;
HTTPClient *_client;
Proxies _proxies;
size_t _proxy_next_index;
URLSpec _proxy;
PT(BioPtr) _bio;
PT(BioStreamPtr) _source;
@ -286,6 +291,7 @@ private:
// off.
enum State {
S_new,
S_try_next_proxy,
S_connecting,
S_connecting_wait,
S_proxy_ready,

View File

@ -17,28 +17,6 @@
////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
//     Function: HTTPClient::set_proxy
//       Access: Published
//  Description: Stores the given URL as the single proxy through
//               which all http and https requests are sent.
////////////////////////////////////////////////////////////////////
INLINE void HTTPClient::
set_proxy(const URLSpec &proxy) {
  this->_proxy = proxy;
}
////////////////////////////////////////////////////////////////////
//     Function: HTTPClient::get_proxy
//       Access: Published
//  Description: Returns a reference to the stored proxy URL that
//               handles all http and https requests.
////////////////////////////////////////////////////////////////////
INLINE const URLSpec &HTTPClient::
get_proxy() const {
  return this->_proxy;
}
////////////////////////////////////////////////////////////////////
// Function: HTTPClient::set_http_version
// Access: Published

View File

@ -44,6 +44,52 @@ bool HTTPClient::_ssl_initialized = false;
X509_STORE *HTTPClient::_x509_store = NULL;
////////////////////////////////////////////////////////////////////
//     Function: trim_blanks
//  Description: Returns a copy of the indicated string with all
//               leading and trailing whitespace (as classified by
//               isspace()) removed.  Interior whitespace is left
//               untouched.  A string that is empty or consists
//               entirely of whitespace yields the empty string.
////////////////////////////////////////////////////////////////////
static string
trim_blanks(const string &str) {
  // isspace() requires a value representable as unsigned char (or
  // EOF); passing a plain char that happens to be negative is
  // undefined behavior, so each character is cast first.
  size_t start = 0;
  while (start < str.length() &&
         isspace(static_cast<unsigned char>(str[start]))) {
    ++start;
  }

  size_t end = str.length();
  while (end > start &&
         isspace(static_cast<unsigned char>(str[end - 1]))) {
    --end;
  }

  return str.substr(start, end - start);
}
////////////////////////////////////////////////////////////////////
//     Function: tokenize
//  Description: Splits the source string at every occurrence of any
//               character in delimiters and appends the resulting
//               pieces to words.  Adjacent delimiters, as well as a
//               leading or trailing delimiter, produce zero-length
//               tokens, so a string containing N delimiters always
//               yields N + 1 tokens (an empty string yields a
//               single empty token).
//
//               The output vector is not cleared first; the caller
//               must do so if only the new tokens are wanted.
////////////////////////////////////////////////////////////////////
static void
tokenize(const string &str, vector_string &words, const string &delimiters) {
  size_t token_start = 0;

  for (;;) {
    if (token_start >= str.length()) {
      // We ran off the end immediately after a delimiter (or the
      // string was empty): the final token is zero-length.
      words.push_back(string());
      return;
    }

    const size_t delim_pos = str.find_first_of(delimiters, token_start);
    if (delim_pos == string::npos) {
      // No more delimiters; the remainder is the last token.
      words.push_back(str.substr(token_start));
      return;
    }

    words.push_back(str.substr(token_start, delim_pos - token_start));
    token_start = delim_pos + 1;
  }
}
////////////////////////////////////////////////////////////////////
// Function: HTTPClient::Constructor
// Access: Published
@ -55,7 +101,7 @@ HTTPClient() {
_verify_ssl = verify_ssl ? VS_normal : VS_no_verify;
_ssl_ctx = (SSL_CTX *)NULL;
_proxy = URLSpec(http_proxy, 1);
set_proxy_spec(http_proxy);
if (!http_proxy_username.empty()) {
set_username("*proxy", "", http_proxy_username);
}
@ -104,7 +150,8 @@ HTTPClient(const HTTPClient &copy) {
////////////////////////////////////////////////////////////////////
void HTTPClient::
operator = (const HTTPClient &copy) {
_proxy = copy._proxy;
_proxies_by_scheme = copy._proxies_by_scheme;
_direct_hosts = copy._direct_hosts;
_http_version = copy._http_version;
_verify_ssl = copy._verify_ssl;
_usernames = copy._usernames;
@ -140,6 +187,328 @@ HTTPClient::
clear_expected_servers();
}
////////////////////////////////////////////////////////////////////
// Function: HTTPClient::set_proxy
// Access: Published
// Description: Specifies the proxy URL to handle all http and
// https requests. Deprecated.
////////////////////////////////////////////////////////////////////
void HTTPClient::
set_proxy(const URLSpec &proxy) {
set_proxy_spec(proxy.get_url());
}
////////////////////////////////////////////////////////////////////
//     Function: HTTPClient::get_proxy
//       Access: Published
//  Description: Deprecated.  Returns the first proxy that
//               get_proxies_for_url() reports for the URL
//               "http://", or an empty URLSpec if it reports none.
////////////////////////////////////////////////////////////////////
URLSpec HTTPClient::
get_proxy() const {
  pvector<URLSpec> candidates;
  get_proxies_for_url(URLSpec("http://"), candidates);

  if (candidates.empty()) {
    return URLSpec();
  }
  return candidates[0];
}
////////////////////////////////////////////////////////////////////
// Function: HTTPClient::set_proxy_spec
// Access: Published
// Description: Specifies the complete set of proxies to use for all
// schemes. This is either a semicolon-delimited set of
// hostname:ports, or a semicolon-delimited set of pairs
// of the form "scheme=hostname:port", or a combination.
// A particular scheme and/or proxy host may be listed
// more than once. This is a convenience function that
// can be used in place of explicit calls to add_proxy()
// for each scheme/proxy pair.
////////////////////////////////////////////////////////////////////
void HTTPClient::
set_proxy_spec(const string &proxy_spec) {
clear_proxy();
// Tokenize the string based on the semicolons.
vector_string proxies;
tokenize(proxy_spec, proxies, ";");
for (vector_string::const_iterator pi = proxies.begin();
pi != proxies.end();
++pi) {
const string &spec = (*pi);
// Divide out the scheme and the hostname.
string scheme;
URLSpec url;
size_t equals = spec.find('=');
if (equals == string::npos) {
scheme = "";
url = URLSpec(spec, true);
} else {
scheme = trim_blanks(spec.substr(0, equals));
url = URLSpec(spec.substr(equals + 1), true);
}
if (!url.has_scheme()) {
// The default scheme for talking to proxies is HTTP.
url.set_scheme("http");
}
add_proxy(scheme, url);
}
}
////////////////////////////////////////////////////////////////////
//     Function: HTTPClient::get_proxy_spec
//       Access: Published
//  Description: Rebuilds a proxy specification string, in the form
//               accepted by set_proxy_spec(), from the proxies
//               currently stored.  Entries are separated by
//               semicolons, and each proxy registered under a
//               nonempty scheme is written as "scheme=url".
//               Because the string is regenerated from the internal
//               per-scheme table, it may be ordered differently from
//               the string originally passed to set_proxy_spec().
////////////////////////////////////////////////////////////////////
string HTTPClient::
get_proxy_spec() const {
  string spec;

  for (ProxiesByScheme::const_iterator entry = _proxies_by_scheme.begin();
       entry != _proxies_by_scheme.end();
       ++entry) {
    const string &scheme = entry->first;
    const Proxies &urls = entry->second;

    // Every proxy under this scheme shares the same "scheme=" prefix
    // (or no prefix at all for the general proxy).
    string prefix;
    if (!scheme.empty()) {
      prefix = scheme;
      prefix += "=";
    }

    for (size_t i = 0; i < urls.size(); ++i) {
      if (!spec.empty()) {
        spec += ";";
      }
      spec += prefix;
      spec += urls[i].get_url();
    }
  }

  return spec;
}
////////////////////////////////////////////////////////////////////
// Function: HTTPClient::set_direct_host_spec
// Access: Published
// Description: Specifies the set of hosts that should be connected
// to directly, without using a proxy. This is a
// semicolon-separated list of hostnames or ip addresses,
// that may contain wildcard characters ("*").
////////////////////////////////////////////////////////////////////
void HTTPClient::
set_direct_host_spec(const string &direct_host_spec) {
clear_direct_host();
// Tokenize the string based on the semicolons.
vector_string hosts;
tokenize(direct_host_spec, hosts, ";");
for (vector_string::const_iterator hi = hosts.begin();
hi != hosts.end();
++hi) {
const string &spec = (*hi);
add_direct_host(trim_blanks(spec));
}
}
////////////////////////////////////////////////////////////////////
//     Function: HTTPClient::get_direct_host_spec
//       Access: Published
//  Description: Joins the stored list of direct hosts (those that
//               are contacted without going through a proxy) into a
//               single semicolon-separated string, in the order in
//               which they were added.
////////////////////////////////////////////////////////////////////
string HTTPClient::
get_direct_host_spec() const {
  string spec;

  for (size_t i = 0; i < _direct_hosts.size(); ++i) {
    // Only emit a separator once something has been written.
    if (!spec.empty()) {
      spec += ";";
    }
    spec += _direct_hosts[i];
  }

  return spec;
}
////////////////////////////////////////////////////////////////////
//     Function: HTTPClient::clear_proxy
//       Access: Published
//  Description: Discards every configured proxy, for every scheme.
//               The set of proxy servers may then be rebuilt with
//               subsequent calls to add_proxy().
////////////////////////////////////////////////////////////////////
void HTTPClient::
clear_proxy() {
  _proxies_by_scheme.clear();
}
////////////////////////////////////////////////////////////////////
//     Function: HTTPClient::add_proxy
//       Access: Published
//  Description: Adds the indicated proxy host as a proxy for
//               communications on the given scheme.  Usually the
//               scheme is "http" or "https".  It may be the empty
//               string to indicate a general proxy.
//
//               The scheme is stored in lowercase, with a single
//               trailing colon removed if one is present.  Proxies
//               added for the same scheme are kept in the order in
//               which they were added.
////////////////////////////////////////////////////////////////////
void HTTPClient::
add_proxy(const string &scheme, const URLSpec &proxy) {
  // The scheme is always converted to lowercase.  tolower() requires
  // a value representable as unsigned char, so cast each character
  // first to avoid undefined behavior on negative char values.
  string lc_scheme;
  lc_scheme.reserve(scheme.length());
  for (string::const_iterator si = scheme.begin(); si != scheme.end(); ++si) {
    lc_scheme += static_cast<char>(tolower(static_cast<unsigned char>(*si)));
  }

  // Remove the trailing colon, if there is one.
  if (!lc_scheme.empty() && lc_scheme[lc_scheme.length() - 1] == ':') {
    lc_scheme.erase(lc_scheme.length() - 1);
  }

  _proxies_by_scheme[lc_scheme].push_back(proxy);
}
////////////////////////////////////////////////////////////////////
//     Function: HTTPClient::clear_direct_host
//       Access: Published
//  Description: Empties the list of hosts that are contacted
//               directly rather than through a proxy.  The list may
//               then be rebuilt with subsequent calls to
//               add_direct_host().
////////////////////////////////////////////////////////////////////
void HTTPClient::
clear_direct_host() {
  _direct_hosts.clear();
}
////////////////////////////////////////////////////////////////////
//     Function: HTTPClient::add_direct_host
//       Access: Published
//  Description: Adds the indicated name to the set of hostnames that
//               are connected to directly, without using a proxy.
//               This name may be either a DNS name or an IP address,
//               and it may include the * as a wildcard character.
//
//               The name is stored in lowercase.
////////////////////////////////////////////////////////////////////
void HTTPClient::
add_direct_host(const string &hostname) {
  // The hostname is always converted to lowercase.  tolower()
  // requires a value representable as unsigned char, so cast each
  // character first to avoid undefined behavior on negative char
  // values.
  string lc_hostname;
  lc_hostname.reserve(hostname.length());
  for (string::const_iterator si = hostname.begin();
       si != hostname.end();
       ++si) {
    lc_hostname += static_cast<char>(tolower(static_cast<unsigned char>(*si)));
  }

  _direct_hosts.push_back(lc_hostname);
}
////////////////////////////////////////////////////////////////////
// Function: HTTPClient::get_proxies_for_url
// Access: Published
// Description: Fills up the indicated vector with the list of
// URLSpec objects, in the order in which they should be
// tried, that are appropriate for the indicated URL.
// The vector is left empty if a direct connection
// should be used.
//
// It is the user's responsibility to empty this vector
// before calling this method; otherwise, the proxy
// URL's will simply be appended to the existing list.
////////////////////////////////////////////////////////////////////
void HTTPClient::
get_proxies_for_url(const URLSpec &url, pvector<URLSpec> &proxies) const {
// First, check if the hostname matches any listed in direct_hosts.
string hostname = url.get_server();
// TODO: This should be a glob match, not a literal match.
vector_string::const_iterator si;
for (si = _direct_hosts.begin(); si != _direct_hosts.end(); ++si) {
if ((*si) == hostname) {
// It matches, so don't use any proxies.
return;
}
}
// Now choose the appropriate proxy based on the scheme.
string scheme = url.get_scheme();
bool got_any = false;
if (scheme.empty()) {
// An empty scheme implies we will want to make a direct
// connection to this host, so we will need a socks-style or
// https-style scheme.
if (get_proxies_for_scheme("socks", proxies)) {
got_any = true;
}
if (get_proxies_for_scheme("https", proxies)) {
got_any = true;
}
} else {
// Otherwise, try to match the proxy to the scheme.
if (get_proxies_for_scheme(scheme, proxies)) {
got_any = true;
}
}
// If we didn't find our scheme of choice, fall back to the default
// proxy type.
if (!got_any) {
if (get_proxies_for_scheme("", proxies)) {
got_any = true;
}
}
// And failing that, try the http proxy.
if (!got_any) {
get_proxies_for_scheme("http", proxies);
}
}
////////////////////////////////////////////////////////////////////
//     Function: HTTPClient::get_proxies_for_url
//       Access: Published
//  Description: String form of get_proxies_for_url(): returns the
//               proxies that apply to the indicated URL, in the
//               order in which they should be tried, joined with
//               semicolons.  The result is the empty string when no
//               proxies apply, i.e. a direct connection should be
//               used.
////////////////////////////////////////////////////////////////////
string HTTPClient::
get_proxies_for_url(const URLSpec &url) const {
  pvector<URLSpec> proxies;
  get_proxies_for_url(url, proxies);

  string result;
  for (pvector<URLSpec>::const_iterator pi = proxies.begin();
       pi != proxies.end();
       ++pi) {
    // Separate every entry but the first from its predecessor.
    if (pi != proxies.begin()) {
      result += ";";
    }
    result += (*pi).get_url();
  }

  return result;
}
////////////////////////////////////////////////////////////////////
// Function: HTTPClient::set_username
// Access: Published
@ -467,6 +836,32 @@ get_ssl_ctx() {
return _ssl_ctx;
}
////////////////////////////////////////////////////////////////////
// Function: HTTPClient::get_proxies_for_scheme
// Access: Private
// Description: Adds the proxy servers associated with the indicated
// scheme, if any, to the list. Returns true if any
// were added, false otherwise.
////////////////////////////////////////////////////////////////////
bool HTTPClient::
get_proxies_for_scheme(const string &scheme, pvector<URLSpec> &proxies) const {
ProxiesByScheme::const_iterator si = _proxies_by_scheme.find(scheme);
if (si == _proxies_by_scheme.end()) {
return false;
}
const Proxies &scheme_proxies = (*si).second;
if (scheme_proxies.empty()) {
return false;
}
Proxies::const_iterator pi;
for (pi = scheme_proxies.begin(); pi != scheme_proxies.end(); ++pi) {
proxies.push_back(*pi);
}
return true;
}
////////////////////////////////////////////////////////////////////
// Function: HTTPClient::add_http_username
// Access: Private

View File

@ -32,6 +32,9 @@
#include "httpAuthorization.h"
#include "httpEnum.h"
#include "pointerTo.h"
#include "pvector.h"
#include "pmap.h"
#include "vector_string.h"
#include <openssl/ssl.h>
@ -59,8 +62,22 @@ PUBLISHED:
void operator = (const HTTPClient &copy);
~HTTPClient();
INLINE void set_proxy(const URLSpec &proxy);
INLINE const URLSpec &get_proxy() const;
void set_proxy(const URLSpec &proxy);
URLSpec get_proxy() const;
void set_proxy_spec(const string &proxy_spec);
string get_proxy_spec() const;
void set_direct_host_spec(const string &direct_host_spec);
string get_direct_host_spec() const;
void clear_proxy();
void add_proxy(const string &scheme, const URLSpec &proxy);
void clear_direct_host();
void add_direct_host(const string &hostname);
void get_proxies_for_url(const URLSpec &url, pvector<URLSpec> &proxies) const;
string get_proxies_for_url(const URLSpec &url) const;
void set_username(const string &server, const string &realm, const string &username);
string get_username(const string &server, const string &realm) const;
@ -93,6 +110,9 @@ public:
SSL_CTX *get_ssl_ctx();
private:
bool get_proxies_for_scheme(const string &scheme,
pvector<URLSpec> &proxies) const;
void add_http_username(const string &http_username);
string select_username(const URLSpec &url, bool is_proxy,
const string &realm) const;
@ -113,7 +133,11 @@ private:
void *arg);
#endif
URLSpec _proxy;
typedef pvector<URLSpec> Proxies;
typedef pmap<string, Proxies> ProxiesByScheme;
ProxiesByScheme _proxies_by_scheme;
vector_string _direct_hosts;
HTTPEnum::HTTPVersion _http_version;
VerifySSL _verify_ssl;

View File

@ -447,7 +447,19 @@ set_query(const string &query) {
////////////////////////////////////////////////////////////////////
void URLSpec::
set_url(const string &url, bool server_name_expected) {
_url = url;
size_t p, q;
// Omit leading and trailing whitespace.
p = 0;
while (p < url.length() && isspace(url[p])) {
p++;
}
q = url.length();
while (q > p && isspace(url[q - 1])) {
q--;
}
_url = url.substr(p, q - p);
_flags = 0;
if (url.empty()) {
@ -457,7 +469,6 @@ set_url(const string &url, bool server_name_expected) {
// First, replace backslashes with forward slashes, since this is a
// common mistake among Windows users.
size_t p;
for (p = 0; p < _url.length(); p++) {
if (_url[p] == '\\') {
_url[p] = '/';