Viewing file: http.php (15.85 KB) -rw-r--r-- Select action/file-type: (+) | (+) | (+) | Code (+) | Session (+) | (+) | SDB (+) | (+) | (+) | (+) | (+) | (+) |
<?php
/*
Plugin Name: Basic HTTP
Description: Check all links that have the HTTP/HTTPS protocol.
Version: 1.0
Author: Janis Elsts
ModuleID: http
ModuleCategory: checker
ModuleContext: on-demand
ModuleLazyInit: true
ModuleClassName: blcHttpChecker
ModulePriority: -1
*/
require_once BLC_DIRECTORY . '/includes/token-bucket.php';
//TODO: Rewrite sub-classes as transports, not stand-alone checkers
/**
 * Entry-point HTTP checker module.
 *
 * During init() it selects a concrete transport (cURL when available,
 * otherwise Snoopy if that class can be loaded) and afterwards forwards
 * can_check() and check() to it. Before each check it takes a token from a
 * per-host token bucket list.
 */
class blcHttpChecker extends blcChecker {
	/* @var blcChecker The selected transport, or null when neither cURL nor Snoopy is usable. */
	public $implementation = null;

	/** @var blcTokenBucketList Token buckets keyed by host name (see check()). */
	private $token_bucket_list;

	/**
	 * Set up the token bucket list from the plugin configuration and pick a transport.
	 *
	 * Leaves $implementation as null if cURL is missing and Snoopy cannot be loaded.
	 *
	 * @return void
	 */
	function init(){
		parent::init();

		//Throttling parameters come from the plugin configuration; the second
		//argument to get() is the fallback passed for a missing setting.
		$settings = blc_get_configuration();
		$rate         = $settings->get('http_throttle_rate', 3);
		$period       = $settings->get('http_throttle_period', 15);
		$min_interval = $settings->get('http_throttle_min_interval', 2);
		$this->token_bucket_list = new blcTokenBucketList($rate, $period, $min_interval);

		//Preferred transport: CURL.
		if ( function_exists('curl_init') || is_callable('curl_init') ) {
			$this->implementation = new blcCurlHttp(
				$this->module_id,
				$this->cached_header,
				$this->plugin_conf,
				$this->module_manager
			);
			return;
		}

		//No CURL. Try to load the Snoopy class that ships in the WordPress includes directory.
		if ( !class_exists('Snoopy') ){
			$snoopy_path = ABSPATH. WPINC . '/class-snoopy.php';
			if ( file_exists($snoopy_path) ){
				include $snoopy_path;
			}
		}

		//Still no Snoopy? Then there is no transport and can_check() will report false.
		if ( !class_exists('Snoopy') ){
			return;
		}

		$this->implementation = new blcSnoopyHttp(
			$this->module_id,
			$this->cached_header,
			$this->plugin_conf,
			$this->module_manager
		);
	}

	/**
	 * Ask the selected transport whether it can check the URL.
	 *
	 * @param string $url
	 * @param array|bool $parsed Passed through unchanged to the transport's can_check().
	 * @return bool False when no transport was selected in init().
	 */
	function can_check($url, $parsed){
		if ( !isset($this->implementation) ){
			return false;
		}
		return $this->implementation->can_check($url, $parsed);
	}

	/**
	 * Take a throttling token for the URL's host, then delegate the check to the transport.
	 *
	 * @param string $url
	 * @param bool $use_get Forwarded as the second argument of the transport's check().
	 * @return mixed Whatever the transport's check() returns.
	 */
	function check($url, $use_get = false){
		global $blclog;

		//Requests are throttled per host name. URLs without a parseable host skip throttling.
		$host = @parse_url($url, PHP_URL_HOST);
		if ( $host ) {
			$this->token_bucket_list->takeToken($host);
		}

		$blclog->debug(sprintf('HTTP module checking "%s"', $url));
		return $this->implementation->check($url, $use_get);
	}
}
/**
 * Base class for checkers that deal with HTTP(S) URLs.
 *
 * Holds the URL clean-up helpers and the shared rule for deciding which
 * HTTP status codes count as errors.
 *
 * @package Broken Link Checker
 * @access public
 */
class blcHttpCheckerBase extends blcChecker {

	/**
	 * Normalise a URL before requesting it.
	 *
	 * Decodes HTML entities, strips a trailing PHPSESSID/sid query parameter,
	 * strips a trailing #fragment, removes Left-to-Right marks and trims
	 * surrounding whitespace.
	 *
	 * @param string $url
	 * @return string
	 */
	function clean_url($url){
		$url = html_entity_decode($url);

		//json_decode() is used to obtain the U+200E character without relying on
		//Unicode escape support in PHP string literals.
		$ltrm = preg_quote(json_decode('"\u200E"'), '/');
		$url = preg_replace(
			array(
				'/([\?&]PHPSESSID=\w+)$/i', //remove session ID
				'/(#[^\/]*)$/', //and anchors/fragments
				//Convert improper (double-encoded) HTML entities. html_entity_decode() above
				//only removes one level of encoding, so "&amp;amp;" is still "&amp;" here.
				//The pattern used to be a bare "&", which replaced "&" with "&" and did nothing.
				'/&amp;/',
				'/([\?&]sid=\w+)$/i', //remove another flavour of session ID
				'/' . $ltrm . '/', //remove Left-to-Right marks that can show up when copying from Word.
			),
			array('', '', '&', '', ''),
			$url
		);
		$url = trim($url);

		return $url;
	}

	/**
	 * Decide whether an HTTP status code means the link is broken.
	 *
	 * "Good" response codes are anything in the 2XX range (e.g "200 OK") and
	 * redirects - the 3XX range. HTTP 401 Unauthorized is a special case that is
	 * considered OK as well. Everything else (including 0) is treated as an error.
	 *
	 * @param int $http_code
	 * @return bool True if the code counts as an error.
	 */
	function is_error_code($http_code){
		$good_code = ( ($http_code >= 200) && ($http_code < 400) ) || ( $http_code == 401 );
		return !$good_code;
	}

	/**
	 * This checker only accepts HTTP(s) links.
	 *
	 * @param string $url
	 * @param array|bool $parsed
	 * @return bool
	 */
	function can_check($url, $parsed){
		if ( !isset($parsed['scheme']) ) return false;

		return in_array( strtolower($parsed['scheme']), array('http', 'https') );
	}

	/**
	 * Takes an URL and replaces spaces and some other non-alphanumeric characters with their urlencoded equivalents.
	 *
	 * @param string $url
	 * @return string
	 */
	function urlencodefix($url){
		//TODO: Remove/fix this. Probably not a good idea to "fix" invalid URLs like that.
		//A method callback is used instead of create_function(), which was removed in PHP 8.0.
		return preg_replace_callback(
			'|[^a-z0-9\+\-\/\\#:.,;=?!&%@()$\|*~_]|i',
			array($this, 'urlencodefix_callback'),
			$url
		);
	}

	/**
	 * preg_replace_callback() handler for urlencodefix().
	 *
	 * @param array $match Regex match; element 0 is the character to encode.
	 * @return string The raw-URL-encoded form of the matched character.
	 */
	function urlencodefix_callback($match){
		return rawurlencode($match[0]);
	}
}
/**
 * HTTP(S) link checker that performs its requests through the PHP cURL extension.
 */
class blcCurlHttp extends blcHttpCheckerBase {

	/** @var string Raw response headers collected by read_header() for the request in progress. */
	var $last_headers = '';

	/**
	 * Request a URL and report whether the link looks broken.
	 *
	 * A HEAD request is sent by default. If that request is judged broken, the
	 * method calls itself again with $use_get = true and returns that result.
	 *
	 * @param string $url
	 * @param bool $use_get Send a GET request (with a Range header) instead of HEAD.
	 * @return array Always contains 'broken', 'timeout', 'warning', 'http_code',
	 *               'final_url', 'request_duration', 'redirect_count', 'log' and
	 *               'result_hash'. 'status_code', 'status_text' and 'error_code'
	 *               are only set for some cURL-level failures.
	 */
	function check($url, $use_get = false){
		global $blclog;
		$blclog->info(__CLASS__ . ' Checking link', $url);
		$this->last_headers = '';

		$url = $this->clean_url($url);
		$blclog->debug(__CLASS__ . ' Clean URL:', $url);

		$result = array(
			'broken' => false,
			'timeout' => false,
			'warning' => false,
		);
		$log = '';

		//Get the BLC configuration. It's used below to set the right timeout values and such.
		$conf = blc_get_configuration();

		//Init curl.
		$ch = curl_init();
		$request_headers = array();
		curl_setopt($ch, CURLOPT_URL, $this->urlencodefix($url));
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

		//Masquerade as Internet Explorer
		$ua = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)';
		//$ua = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko';
		curl_setopt($ch, CURLOPT_USERAGENT, $ua);

		//Close the connection after the request (disables keep-alive). The plugin rate-limits requests,
		//so it's likely we'd overrun the keep-alive timeout anyway.
		curl_setopt($ch, CURLOPT_FORBID_REUSE, true);
		$request_headers[] = 'Connection: close';

		//Add a semi-plausible referer header to avoid tripping up some bot traps
		curl_setopt($ch, CURLOPT_REFERER, home_url());

		//Redirects don't work when safe mode or open_basedir is enabled.
		if ( !blcUtility::is_safe_mode() && !blcUtility::is_open_basedir() ) {
			curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
		}
		//Set maximum redirects
		curl_setopt($ch, CURLOPT_MAXREDIRS, 10);

		//Set the timeout
		curl_setopt($ch, CURLOPT_TIMEOUT, $conf->options['timeout']);
		curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $conf->options['timeout']);

		//Set the proxy configuration. The user can provide this in wp-config.php
		if (defined('WP_PROXY_HOST')) {
			curl_setopt($ch, CURLOPT_PROXY, WP_PROXY_HOST);
		}
		if (defined('WP_PROXY_PORT')) {
			curl_setopt($ch, CURLOPT_PROXYPORT, WP_PROXY_PORT);
		}
		if (defined('WP_PROXY_USERNAME')){
			$auth = WP_PROXY_USERNAME;
			if (defined('WP_PROXY_PASSWORD')){
				$auth .= ':' . WP_PROXY_PASSWORD;
			}
			curl_setopt($ch, CURLOPT_PROXYUSERPWD, $auth);
		}

		//Make CURL return a valid result even if it gets a 404 or other error.
		curl_setopt($ch, CURLOPT_FAILONERROR, false);

		$nobody = !$use_get; //Whether to send a HEAD request (the default) or a GET request

		//parse_url() yields null when the URL has no scheme and false when it is badly
		//malformed, so check the type before comparing. URL schemes are case-insensitive,
		//hence the strtolower().
		$scheme = @parse_url($url, PHP_URL_SCHEME);
		if ( is_string($scheme) && (strtolower($scheme) === 'https') ){
			curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); //Required to make HTTPS URLs work.
			curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
			//$nobody = false; //Can't use HEAD with HTTPS.
		}

		if ( $nobody ){
			//If possible, use HEAD requests for speed.
			curl_setopt($ch, CURLOPT_NOBODY, true);
		} else {
			//If we must use GET at least limit the amount of downloaded data.
			$request_headers[] = 'Range: bytes=0-2048'; //2 KB
		}

		//Set request headers.
		if ( !empty($request_headers) ) {
			curl_setopt($ch, CURLOPT_HTTPHEADER, $request_headers);
		}

		//Register a callback function which will process the HTTP header(s).
		//It can be called multiple times if the remote server performs a redirect.
		curl_setopt($ch, CURLOPT_HEADERFUNCTION, array($this,'read_header'));

		//Record request headers.
		if ( defined('CURLINFO_HEADER_OUT') ) {
			curl_setopt($ch, CURLINFO_HEADER_OUT, true);
		}

		//Execute the request
		$start_time = microtime_float();
		$content = curl_exec($ch);
		$measured_request_duration = microtime_float() - $start_time;
		$blclog->debug(sprintf('HTTP request took %.3f seconds', $measured_request_duration));

		$info = curl_getinfo($ch);

		//Store the results
		$result['http_code'] = intval( $info['http_code'] );
		$result['final_url'] = $info['url'];
		$result['request_duration'] = $info['total_time'];
		$result['redirect_count'] = $info['redirect_count'];

		//CURL doesn't return a request duration when a timeout happens, so we measure it ourselves.
		//It is useful to see how long the plugin waited for the server to respond before assuming it timed out.
		if( empty($result['request_duration']) ){
			$result['request_duration'] = $measured_request_duration;
		}

		//Determine if the link counts as "broken"
		if ( $result['http_code'] == 0 ){
			//No HTTP status at all means the request failed at the CURL level.
			$result['broken'] = true;

			$error_code = curl_errno($ch);
			$log .= sprintf( "%s [Error #%d]\n", curl_error($ch), $error_code );

			//We only handle a couple of CURL error codes; most are highly esoteric.
			//libcurl "CURLE_" constants can't be used here because some of them have
			//different names or values in PHP.
			switch( $error_code ) {
				case 6: //CURLE_COULDNT_RESOLVE_HOST
					$result['status_code'] = BLC_LINK_STATUS_WARNING;
					$result['status_text'] = __('Server Not Found', 'broken-link-checker');
					$result['error_code'] = 'couldnt_resolve_host';
					break;

				case 28: //CURLE_OPERATION_TIMEDOUT
					$result['timeout'] = true;
					break;

				case 7: //CURLE_COULDNT_CONNECT
					//More often than not, this error code indicates that the connection attempt
					//timed out. This heuristic tries to distinguish between connections that fail
					//due to timeouts and those that fail due to other causes.
					if ( $result['request_duration'] >= 0.9*$conf->options['timeout'] ){
						$result['timeout'] = true;
					} else {
						$result['status_code'] = BLC_LINK_STATUS_WARNING;
						$result['status_text'] = __('Connection Failed', 'broken-link-checker');
						$result['error_code'] = 'connection_failed';
					}
					break;

				default:
					$result['status_code'] = BLC_LINK_STATUS_WARNING;
					$result['status_text'] = __('Unknown Error', 'broken-link-checker');
			}

		} else {
			$result['broken'] = $this->is_error_code($result['http_code']);
		}
		curl_close($ch);

		$blclog->info(sprintf(
			'HTTP response: %d, duration: %.2f seconds, status text: "%s"',
			$result['http_code'],
			$result['request_duration'],
			isset($result['status_text']) ? $result['status_text'] : 'N/A'
		));

		if ( $nobody && $result['broken'] ){
			//The site in question might be expecting GET instead of HEAD, so lets retry the request
			//using the GET verb.
			//Note : normally a server that doesn't allow HEAD requests on a specific resource *should*
			//return "405 Method Not Allowed". Unfortunately, there are sites that return 404 or
			//another, even more general, error code instead. So just checking for 405 wouldn't be enough.
			return $this->check($url, true);
		}

		//When safe_mode or open_basedir is enabled CURL will be forbidden from following redirects,
		//so redirect_count will be 0 for all URLs. As a workaround, set it to 1 when the HTTP
		//response codes indicates a redirect but redirect_count is zero.
		//Note to self : Extracting the Location header might also be helpful.
		if ( ($result['redirect_count'] == 0) && ( in_array( $result['http_code'], array(301, 302, 303, 307) ) ) ){
			$result['redirect_count'] = 1;
		}

		//Build the log from HTTP code and headers.
		$log .= '=== ';
		if ( $result['http_code'] ){
			$log .= sprintf( __('HTTP code : %d', 'broken-link-checker'), $result['http_code']);
		} else {
			$log .= __('(No response)', 'broken-link-checker');
		}
		$log .= " ===\n\n";

		$log .= "Response headers\n" . str_repeat('=', 16) . "\n";
		$log .= htmlentities($this->last_headers);

		//'request_header' is only read if curl_getinfo() returned it.
		if ( isset($info['request_header']) ) {
			$log .= "Request headers\n" . str_repeat('=', 16) . "\n";
			$log .= htmlentities($info['request_header']);
		}

		//Include the start of the response body, but only for broken links fetched with GET.
		if ( !$nobody && ($content !== false) && $result['broken'] ) {
			$log .= "Response HTML\n" . str_repeat('=', 16) . "\n";
			$log .= htmlentities(substr($content, 0, 2048));
		}

		if ( !empty($result['broken']) && !empty($result['timeout']) ) {
			$log .= "\n(" . __("Most likely the connection timed out or the domain doesn't exist.", 'broken-link-checker') . ')';
		}

		$result['log'] = $log;

		//The hash should contain info about all pieces of data that pertain to determining if the
		//link is working.
		$result['result_hash'] = implode('|', array(
			$result['http_code'],
			!empty($result['broken'])?'broken':'0',
			!empty($result['timeout'])?'timeout':'0',
			blcLink::remove_query_string($result['final_url']),
		));

		return $result;
	}

	/**
	 * CURLOPT_HEADERFUNCTION callback: append a chunk of response headers to $last_headers.
	 *
	 * @param resource $ch cURL handle (unused).
	 * @param string $header Header data handed over by cURL.
	 * @return int Length of $header in bytes, which is what cURL expects the callback to return.
	 */
	function read_header(/** @noinspection PhpUnusedParameterInspection */ $ch, $header){
		$this->last_headers .= $header;
		return strlen($header);
	}
}
/**
 * HTTP link checker that performs its requests through the Snoopy class.
 * blcHttpChecker selects it when cURL is not available.
 */
class blcSnoopyHttp extends blcHttpCheckerBase {

	/**
	 * Fetch a URL with Snoopy and report whether the link looks broken.
	 *
	 * @param string $url
	 * @param bool $use_get Accepted so the signature matches blcCurlHttp::check();
	 *                      it is not used, every request goes through Snoopy::fetch().
	 * @return array Contains 'broken', 'timeout', 'http_code', 'final_url',
	 *               'request_duration', 'redirect_count', 'log' and 'result_hash'.
	 */
	function check($url, $use_get = false){
		$url = $this->clean_url($url);
		//Note : Snoopy doesn't work too well with HTTPS URLs.

		$result = array(
			'broken' => false,
			'timeout' => false,
			//Always present, like in the CURL-based checker; overwritten below on redirect.
			'redirect_count' => 0,
		);
		$log = '';

		//Get the timeout setting from the BLC configuration.
		$conf = blc_get_configuration();
		$timeout = $conf->options['timeout'];

		$start_time = microtime_float();

		//Fetch the URL with Snoopy
		$snoopy = new Snoopy;
		$snoopy->read_timeout = $timeout; //read timeout in seconds
		$snoopy->agent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"; //masquerade as IE 7
		$snoopy->referer = home_url(); //valid referer helps circumvent some hotlink protection schemes
		$snoopy->maxlength = 1024*5; //load up to 5 kilobytes
		$snoopy->fetch( $this->urlencodefix($url) );

		$result['request_duration'] = microtime_float() - $start_time;

		$result['http_code'] = $snoopy->status; //HTTP status code
		//Snoopy returns -100 on timeout
		if ( $result['http_code'] == -100 ){
			$result['http_code'] = 0;
			$result['timeout'] = true;
		}

		//Build the log
		$log .= '=== ';
		if ( $result['http_code'] ){
			$log .= sprintf( __('HTTP code : %d', 'broken-link-checker'), $result['http_code']);
		} else {
			$log .= __('(No response)', 'broken-link-checker');
		}
		$log .= " ===\n\n";

		if ($snoopy->error)
			$log .= $snoopy->error."\n";
		if ($snoopy->timed_out) {
			$log .= __("Request timed out.", 'broken-link-checker') . "\n";
			$result['timeout'] = true;
		}

		if ( is_array($snoopy->headers) )
			$log .= implode("", $snoopy->headers)."\n"; //those headers already contain newlines

		//Redirected?
		if ( $snoopy->lastredirectaddr ) {
			$result['final_url'] = $snoopy->lastredirectaddr;
			$result['redirect_count'] = $snoopy->_redirectdepth;
		} else {
			$result['final_url'] = $url;
		}

		//Determine if the link counts as "broken"
		$result['broken'] = $this->is_error_code($result['http_code']) || $result['timeout'];

		$log .= "<em>(" . __('Using Snoopy', 'broken-link-checker') . ")</em>";
		$result['log'] = $log;

		//The hash should contain info about all pieces of data that pertain to determining if the
		//link is working.
		$result['result_hash'] = implode('|', array(
			$result['http_code'],
			$result['broken']?'broken':'0',
			$result['timeout']?'timeout':'0',
			blcLink::remove_query_string($result['final_url']),
		));

		return $result;
	}
}
|