refine interface for determining file size of download

This commit is contained in:
David Rose 2003-08-12 21:07:59 +00:00
parent 191422d67e
commit ed6187271c
9 changed files with 191 additions and 68 deletions

View File

@ -17,6 +17,7 @@
////////////////////////////////////////////////////////////////////
#include "chunkedStreamBuf.h"
#include "config_downloader.h"
#include <ctype.h>
// This module is not compiled if OpenSSL is not available.
@ -77,7 +78,8 @@ open_read(BioStreamPtr *source, HTTPChannel *doc) {
if (_doc != (HTTPChannel *)NULL) {
_read_index = doc->_read_index;
_doc->_file_size = 0;
_doc->_transfer_file_size = 0;
_doc->_got_transfer_file_size = true;
// Read a little bit from the file to get the first chunk (and
// therefore the file size, or at least the size of the first
@ -180,9 +182,16 @@ read_chars(char *start, size_t length) {
return 0;
}
size_t chunk_size = (size_t)strtol(line.c_str(), NULL, 16);
if (downloader_cat.is_spam()) {
downloader_cat.spam()
<< "Got chunk of size " << chunk_size << " bytes.\n";
}
if (chunk_size == 0) {
// Last chunk; we're done.
_done = true;
_doc->_file_size = _doc->_transfer_file_size;
_doc->_got_file_size = true;
if (_doc != (HTTPChannel *)NULL && _read_index == _doc->_read_index) {
_doc->finished_body(true);
}
@ -190,7 +199,7 @@ read_chars(char *start, size_t length) {
}
if (_doc != (HTTPChannel *)NULL && _read_index == _doc->_read_index) {
_doc->_file_size += chunk_size;
_doc->_transfer_file_size += chunk_size;
}
_chunk_remaining = chunk_size;

View File

@ -88,8 +88,12 @@ const string http_proxy =
config_downloader.GetString("http-proxy", "");
// The direct-host specification, read from "http-direct-hosts" (empty
// by default).  The HTTPClient constructor passes this string to
// set_direct_host_spec().
const string http_direct_hosts =
config_downloader.GetString("http-direct-hosts", "");
// Initial value of HTTPClient's try-all-direct flag, read from
// "http-try-all-direct" (true by default).  See
// HTTPClient::set_try_all_direct().
const bool http_try_all_direct =
config_downloader.GetBool("http-try-all-direct", true);
// The proxy username, read from "http-proxy-username" (empty by
// default).  When it is nonempty, the HTTPClient constructor registers
// it via set_username() under the "*proxy" server name.
const string http_proxy_username =
config_downloader.GetString("http-proxy-username", "");
// Initial value of HTTPChannel's proxy-tunnel flag, read from
// "http-proxy-tunnel" (false by default).  See
// HTTPChannel::set_proxy_tunnel().
const bool http_proxy_tunnel =
config_downloader.GetBool("http-proxy-tunnel", false);
// This is the default amount of time to wait for a TCP/IP connection
// to be established, in seconds. It is presently only used for

View File

@ -45,7 +45,9 @@ extern const bool verify_ssl;
extern const string ssl_cipher_list;
extern const string http_proxy;
extern const string http_direct_hosts;
extern const bool http_try_all_direct;
extern const string http_proxy_username;
extern const bool http_proxy_tunnel;
extern const double connect_timeout;
extern const double http_timeout;
extern const int http_max_connect_count;

View File

@ -241,6 +241,44 @@ get_allow_proxy() const {
return _allow_proxy;
}
////////////////////////////////////////////////////////////////////
// Function: HTTPChannel::set_proxy_tunnel
// Access: Published
// Description: Chooses how this channel makes use of a proxy.
//
// Normally, ordinary URLs are requested from the proxy
// itself; the proxy then decides whether to return a
// cached copy of the document or to contact the server
// for a fresh one, and it may modify the headers and
// the transfer encoding along the way.
//
// When this flag is true, the proxy is instead asked
// to open a connection to the server (for instance, on
// port 80).  If the proxy honors that request, the
// document is retrieved from the server directly
// through the connection; if it does not, the retrieve
// operation fails.
//
// SSL connections (e.g. https) and connections through
// a Socks proxy are always tunneled, whatever the
// setting of this flag.
////////////////////////////////////////////////////////////////////
INLINE void HTTPChannel::
set_proxy_tunnel(bool proxy_tunnel) {
  this->_proxy_tunnel = proxy_tunnel;
}
////////////////////////////////////////////////////////////////////
// Function: HTTPChannel::get_proxy_tunnel
// Access: Published
// Description: Reports the flag stored by set_proxy_tunnel(): true
// if connections always tunnel through a proxy, false
// (the normal case) if the proxy is allowed to serve
// up documents itself.
////////////////////////////////////////////////////////////////////
INLINE bool HTTPChannel::
get_proxy_tunnel() const {
  return this->_proxy_tunnel;
}
////////////////////////////////////////////////////////////////////
// Function: HTTPChannel::set_connect_timeout
// Access: Published
@ -432,22 +470,36 @@ get_max_updates_per_second() const {
}
////////////////////////////////////////////////////////////////////
// Function: HTTPChannel::get_file_size
// Function: HTTPChannel::set_expected_file_size
// Access: Published
// Description: Returns the size of the file, if it is known.
// Returns 0 if the file size is not known.
//
// If the file is dynamically generated, the size may
// not be available until a read has started
// (e.g. open_read_file() has been called); and even
// then it may increase as more of the file is read due
// to the nature of HTTP/1.1 requests which can change
// their minds midstream about how much data they're
// sending you.
// Description: This may be called immediately after a call to
// get_document() or some related function to specify
// the expected size of the document we are retrieving,
// if we happen to know. This is used as the return
// value to get_file_size() only in the case that the
// server does not tell us the actual file size.
////////////////////////////////////////////////////////////////////
INLINE size_t HTTPChannel::
get_file_size() const {
return _file_size;
INLINE void HTTPChannel::
set_expected_file_size(size_t file_size) {
_expected_file_size = file_size;
_got_expected_file_size = true;
}
////////////////////////////////////////////////////////////////////
// Function: HTTPChannel::is_file_size_known
// Access: Published
// Description: Returns true if the server has told us the size of
// the file currently being retrieved, so that the size
// is reliably known.  Returns false if the value
// reported by get_file_size() is only an educated guess
// (possibly the value given to
// set_expected_file_size(), or a size inferred from a
// chunked transfer encoding still in progress).
////////////////////////////////////////////////////////////////////
INLINE bool HTTPChannel::
is_file_size_known() const {
  return this->_got_file_size;
}
////////////////////////////////////////////////////////////////////

View File

@ -44,6 +44,7 @@ HTTPChannel(HTTPClient *client) :
_proxy_next_index = 0;
_persistent_connection = false;
_allow_proxy = true;
_proxy_tunnel = http_proxy_tunnel;
_connect_timeout = connect_timeout;
_http_timeout = http_timeout;
_blocking_connect = false;
@ -55,13 +56,18 @@ HTTPChannel(HTTPClient *client) :
_nonblocking = false;
_want_ssl = false;
_proxy_serves_document = false;
_proxy_tunnel = false;
_proxy_tunnel_now = false;
_first_byte_requested = 0;
_last_byte_requested = 0;
_first_byte_delivered = 0;
_last_byte_delivered = 0;
_read_index = 0;
_expected_file_size = 0;
_file_size = 0;
_transfer_file_size = 0;
_got_expected_file_size = false;
_got_file_size = false;
_got_transfer_file_size = false;
_bytes_downloaded = 0;
_bytes_requested = 0;
_status_code = 0;
@ -179,6 +185,35 @@ get_header_value(const string &key) const {
return string();
}
////////////////////////////////////////////////////////////////////
// Function: HTTPChannel::get_file_size
// Access: Published
// Description: Returns the best available figure for the size of
// the file.  The known file size is preferred; failing
// that, the size accumulated so far from the transfer
// itself; failing that, the value supplied to
// set_expected_file_size(); and 0 if none of these is
// available.
//
// For a dynamically generated file, the size may not
// be available until a read has started
// (e.g. open_read_file() has been called), and even
// then it may grow as more of the file is read, since
// an HTTP/1.1 response can change its mind midstream
// about how much data it is sending.
////////////////////////////////////////////////////////////////////
size_t HTTPChannel::
get_file_size() const {
  // Consult the sources in order of decreasing reliability and stop
  // at the first one that has been set.
  if (_got_file_size) {
    return _file_size;
  }
  if (_got_transfer_file_size) {
    return _transfer_file_size;
  }
  if (_got_expected_file_size) {
    return _expected_file_size;
  }

  // Nothing is known about the size at all.
  return 0;
}
////////////////////////////////////////////////////////////////////
// Function: HTTPChannel::write_headers
// Access: Published
@ -421,17 +456,15 @@ read_body() {
}
string transfer_coding = downcase(get_header_value("Transfer-Encoding"));
string content_length = get_header_value("Content-Length");
ISocketStream *result;
if (transfer_coding == "chunked") {
// "chunked" transfer encoding. This means we will have to decode
// the length of the file as we read it in chunks. The
// IChunkedStream does this.
_file_size = 0;
_state = S_reading_body;
_read_index++;
result = new IChunkedStream(_source, (HTTPChannel *)this);
result = new IChunkedStream(_source, this);
} else {
// If the transfer encoding is anything else, assume "identity".
@ -440,8 +473,7 @@ read_body() {
// specified), or till end of file otherwise.
_state = S_reading_body;
_read_index++;
result = new IIdentityStream(_source, (HTTPChannel *)this,
!content_length.empty(), _file_size);
result = new IIdentityStream(_source, this, _got_file_size, _file_size);
}
return result;
@ -709,7 +741,7 @@ run_connecting() {
<< _bio->get_port() << "\n";
}
if (_proxy_tunnel) {
if (_proxy_tunnel_now) {
if (_proxy.get_scheme() == "socks") {
_state = S_socks_proxy_greet;
} else {
@ -833,7 +865,8 @@ run_http_proxy_request_sent() {
_current_field_name = string();
_current_field_value = string();
_headers.clear();
_file_size = 0;
_got_file_size = false;
_got_transfer_file_size = false;
return false;
}
@ -1355,7 +1388,8 @@ run_request_sent() {
_current_field_name = string();
_current_field_value = string();
_headers.clear();
_file_size = 0;
_got_file_size = false;
_got_transfer_file_size = false;
return false;
}
@ -1451,15 +1485,20 @@ run_reading_header() {
return false;
}
_file_size = 0;
_got_expected_file_size = false;
_got_file_size = false;
_got_transfer_file_size = false;
string content_length = get_header_value("Content-Length");
if (!content_length.empty()) {
_file_size = atoi(content_length.c_str());
_got_file_size = true;
} else if (get_status_code() == 206) {
// Well, we didn't get a content-length from the server, but we
// can infer the number of bytes based on the range we're given.
_file_size = _last_byte_delivered - _first_byte_delivered + 1;
_got_file_size = true;
}
_redirect = get_header_value("Location");
@ -1608,15 +1647,15 @@ run_begin_body() {
// We have already "read" the nonexistent body.
_state = S_read_trailer;
} else if (_file_size > 8192) {
} else if (get_file_size() > 8192) {
// If we know the size of the body we are about to skip and it's
// too large (and here we arbitrarily say 8KB is too large), then
// don't bother skipping it--just drop the connection and get a
// new one.
if (downloader_cat.is_debug()) {
downloader_cat.debug()
<< "Dropping connection rather than skipping past " << _file_size
<< " bytes.\n";
<< "Dropping connection rather than skipping past "
<< get_file_size() << " bytes.\n";
}
reset_to_new();
@ -1920,26 +1959,28 @@ begin_request(HTTPEnum::Method method, const DocumentSpec &url,
////////////////////////////////////////////////////////////////////
void HTTPChannel::
reconsider_proxy() {
_proxy_tunnel = false;
_proxy_tunnel_now = false;
_proxy_serves_document = false;
if (!_proxy.empty()) {
// If we're opening an SSL connection, or the user has explicitly
// If the user insists we always tunnel through a proxy, or if
// we're opening an SSL connection, or the user has explicitly
// asked for a direct connection of some kind, or if we have a
// SOCKS-style proxy; each of these demands a tunnel through the
// proxy to speak directly to the http server.
_proxy_tunnel =
(_want_ssl || _method == HTTPEnum::M_connect || _proxy.get_scheme() == "socks");
_proxy_tunnel_now =
(get_proxy_tunnel() || _want_ssl ||
_method == HTTPEnum::M_connect || _proxy.get_scheme() == "socks");
// Otherwise (but we still have a proxy), then we ask the proxy to
// hand us the document.
_proxy_serves_document = !_proxy_tunnel;
_proxy_serves_document = !_proxy_tunnel_now;
}
make_header();
make_request_text();
if (_proxy_tunnel) {
if (_proxy_tunnel_now) {
// Maybe we need to tunnel through the proxy to connect to the
// server directly.
ostringstream request;

View File

@ -93,6 +93,8 @@ PUBLISHED:
INLINE void set_allow_proxy(bool allow_proxy);
INLINE bool get_allow_proxy() const;
INLINE void set_proxy_tunnel(bool proxy_tunnel);
INLINE bool get_proxy_tunnel() const;
INLINE void set_connect_timeout(double timeout_seconds);
INLINE double get_connect_timeout() const;
@ -111,7 +113,9 @@ PUBLISHED:
INLINE void set_max_updates_per_second(double max_updates_per_second);
INLINE double get_max_updates_per_second() const;
INLINE void set_expected_file_size(size_t file_size);
INLINE size_t get_file_size() const;
INLINE bool is_file_size_known() const;
void write_headers(ostream &out) const;
@ -255,6 +259,7 @@ private:
PT(BioStreamPtr) _source;
bool _persistent_connection;
bool _allow_proxy;
bool _proxy_tunnel;
double _connect_timeout;
double _http_timeout;
bool _blocking_connect;
@ -274,7 +279,7 @@ private:
string _body;
bool _want_ssl;
bool _proxy_serves_document;
bool _proxy_tunnel;
bool _proxy_tunnel_now;
bool _server_response_has_no_body;
size_t _first_byte_requested;
size_t _last_byte_requested;
@ -323,9 +328,14 @@ private:
typedef pmap<string, string> Headers;
Headers _headers;
size_t _expected_file_size;
size_t _file_size;
size_t _transfer_file_size;
size_t _bytes_downloaded;
size_t _bytes_requested;
bool _got_expected_file_size;
bool _got_file_size;
bool _got_transfer_file_size;
// These members are used to maintain the current state while
// communicating with the server. We need to store everything in

View File

@ -17,6 +17,33 @@
////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
// Function: HTTPClient::set_try_all_direct
// Access: Published
// Description: Controls the fallback taken when a connection
// attempt through a proxy fails.  If true, a direct
// connection is always tried afterwards, whether or not
// the host appears on the direct_host_spec list.  If
// false, no direct attempt is made while a proxy is in
// effect, even when the proxy fails.
////////////////////////////////////////////////////////////////////
INLINE void HTTPClient::
set_try_all_direct(bool try_all_direct) {
  this->_try_all_direct = try_all_direct;
}
////////////////////////////////////////////////////////////////////
// Function: HTTPClient::get_try_all_direct
// Access: Published
// Description: Returns true if a failed connection through a proxy
// will be followed up by a direct connection attempt,
// false otherwise.  See set_try_all_direct().
////////////////////////////////////////////////////////////////////
INLINE bool HTTPClient::
get_try_all_direct() const {
  return this->_try_all_direct;
}
////////////////////////////////////////////////////////////////////
// Function: HTTPClient::set_http_version
// Access: Published

View File

@ -102,6 +102,7 @@ HTTPClient() {
set_proxy_spec(http_proxy);
set_direct_host_spec(http_direct_hosts);
_try_all_direct = http_try_all_direct;
if (!http_proxy_username.empty()) {
set_username("*proxy", "", http_proxy_username);
@ -155,6 +156,7 @@ void HTTPClient::
operator = (const HTTPClient &copy) {
_proxies_by_scheme = copy._proxies_by_scheme;
_direct_hosts = copy._direct_hosts;
_try_all_direct = copy._try_all_direct;
_http_version = copy._http_version;
_verify_ssl = copy._verify_ssl;
_usernames = copy._usernames;
@ -190,33 +192,6 @@ HTTPClient::
clear_expected_servers();
}
////////////////////////////////////////////////////////////////////
// Function: HTTPClient::set_proxy
// Access: Published
// Description: Specifies the proxy URL to handle all http and
// https requests. Deprecated.
////////////////////////////////////////////////////////////////////
void HTTPClient::
set_proxy(const URLSpec &proxy) {
set_proxy_spec(proxy.get_url());
}
////////////////////////////////////////////////////////////////////
// Function: HTTPClient::get_proxy
// Access: Published
// Description: Returns the proxy URL that handles all http and
// https requests: the first entry that
// get_proxies_for_url() produces for "http://", or an
// empty URLSpec if it produces none.  Deprecated.
////////////////////////////////////////////////////////////////////
URLSpec HTTPClient::
get_proxy() const {
  pvector<URLSpec> candidates;
  get_proxies_for_url(URLSpec("http://"), candidates);

  // Only the first candidate is reported.
  return candidates.empty() ? URLSpec() : candidates[0];
}
////////////////////////////////////////////////////////////////////
// Function: HTTPClient::set_proxy_spec
// Access: Published
@ -524,8 +499,10 @@ get_proxies_for_url(const URLSpec &url, pvector<URLSpec> &proxies) const {
}
}
// We always try a direct connection if all else fails.
if (_try_all_direct) {
// We may try a direct connection if all else fails.
temp_list.push_back(URLSpec());
}
// Finally, as a very last resort, fall back to the HTTP proxy.
if (!got_any) {

View File

@ -62,15 +62,15 @@ PUBLISHED:
void operator = (const HTTPClient &copy);
~HTTPClient();
void set_proxy(const URLSpec &proxy);
URLSpec get_proxy() const;
void set_proxy_spec(const string &proxy_spec);
string get_proxy_spec() const;
void set_direct_host_spec(const string &direct_host_spec);
string get_direct_host_spec() const;
INLINE void set_try_all_direct(bool try_all_direct);
INLINE bool get_try_all_direct() const;
void clear_proxy();
void add_proxy(const string &scheme, const URLSpec &proxy);
void clear_direct_host();
@ -141,6 +141,7 @@ private:
ProxiesByScheme _proxies_by_scheme;
typedef pvector<GlobPattern> DirectHosts;
DirectHosts _direct_hosts;
bool _try_all_direct;
HTTPEnum::HTTPVersion _http_version;
VerifySSL _verify_ssl;