Whether you operate a commercial site, a directory, or a personal site, it is important to ensure you do not have ‘dead’ links on your website. Broken links (links that point to inactive domains or 404 pages) are of little use to your site visitors and may jeopardise any good search engine rankings you have, as broken links can suggest that your site is not well maintained.
To remedy any potential problem, using a script to periodically check links on your pages means you can quickly alter & remove links that are no longer active or useful.
The following script will do this task for you, using PHP and cURL, with a simple HTML parser to find links on a page. Simply enter a URL into the form, and the results will appear in an iframe on the same page.
/**
 * Minimal link extractor/checker.
 *
 * rel2abs() resolves links found in a page against the page URL;
 * parse_for_links() walks a DOMDocument, probes every http(s) link with the
 * command-line curl binary and prints one colour-coded result line per link.
 */
class html_parser {
    // Maximum number of URLs checked per request. The counter is the by-reference
    // $i argument of parse_for_links(), so the limit spans successive calls.
    const MAX_LINKS = 100;

    /**
     * Convert a relative link to an absolute URL.
     *
     * @param string $rel  Link as written in the document (relative or absolute).
     * @param string $base Absolute URL of the document the link was found in.
     * @return string Absolute URL; $base itself when $rel is empty.
     */
    public function rel2abs($rel, $base) {
        $rel = (string) $rel;
        if ($rel === '')
            return $base;

        $p = parse_url($rel);
        if (isset($p['scheme']) && $p['scheme']) {
            /* already absolute: just make sure there is a path component */
            if (!isset($p['path'])) {
                if (isset($p['query']))
                    $rel = preg_replace("'\?'", '/?', $rel, 1);
                else
                    $rel .= '/';
            }
            return $rel;
        }

        /* pick the base URL apart explicitly instead of extract()ing it, so a
           base without a path or scheme cannot leave variables undefined */
        $b = parse_url($base);
        if (!is_array($b))
            $b = array();
        $scheme = isset($b['scheme']) ? $b['scheme'] : 'http';
        $host   = isset($b['host']) ? $b['host'] : '';
        if (isset($b['port']))
            $host .= ':'.$b['port']; /* keep a non-default port */
        $path   = isset($b['path']) ? $b['path'] : '';

        /* protocol-relative link (//host/path): only the scheme is inherited */
        if (substr($rel, 0, 2) === '//')
            return $scheme.':'.$rel;

        /* queries and anchors */
        if ($rel[0] === '#' || $rel[0] === '?')
            return $base.$rel;

        /* remove non-directory element from path */
        $path = preg_replace('#/[^/]*$#', '', $path);

        /* destroy path if relative url points to root */
        if ($rel[0] === '/')
            $path = '';

        /* dirty absolute URL */
        $abs = "$host$path/$rel";

        /* replace '//' or '/./' or '/foo/../' with '/' until nothing changes */
        $re = array('#(/\.?/)#', '#/(?!\.\.)[^/]+/\.\./#');
        do {
            $abs = preg_replace($re, '/', $abs, -1, $n);
        } while ($n > 0);

        /* absolute URL is ready! */
        return $scheme.'://'.$abs;
    }

    /**
     * Find every <$tag $attr="..."> in the document, check each distinct
     * http(s) target with a HEAD request and print one result line per URL.
     *
     * @param DOMDocument $dom  Parsed page.
     * @param string      $url  URL of the page, used to resolve relative links.
     * @param string      $tag  Element name to scan (e.g. 'a', 'img').
     * @param string      $attr Attribute holding the link (e.g. 'href', 'src').
     * @param int         $i    Running count of checked URLs, updated in place.
     * @return void
     */
    public function parse_for_links($dom,$url,$tag,$attr,&$i) {
        $done = array(); // targets already checked during this call

        foreach ($dom->getElementsByTagName($tag) as $link) {
            // Enforce the limit before doing any work so that exactly
            // MAX_LINKS URLs are checked, also across successive calls.
            if ($i >= self::MAX_LINKS)
                break;

            $href = $link->getAttribute($attr);
            if (!strlen($href) || $href[0] == '#' || preg_match("'^javascript'i", $href))
                continue;

            // Absolute URL without its fragment; the scheme is kept so that
            // https links are really checked over https.
            $target = preg_replace('~#.*$~', '', $this->rel2abs($href, $url));

            // mailto:, tel:, data:, file: ... cannot be checked over HTTP and
            // must never be handed to curl.
            if (!preg_match('~^https?://~i', $target))
                continue;

            // De-duplicate on the scheme-less form (http/https count as one).
            $key = preg_replace('~^[^:]+://~', '', $target);
            if (isset($done[$key]))
                continue;
            $done[$key] = true;

            // HEAD request; curl prints "<effective url>\t<status>\t<seconds>".
            $command = 'curl -I -A "Broken Link Checker" -s --max-redirs 5 -m 5 --retry 1 --retry-delay 10 -w "%{url_effective}\t%{http_code}\t%{time_total}" -o temp2.txt '.escapeshellarg($target);
            $result = explode("\t", (string) shell_exec($command));

            // curl may be missing or may print nothing: fall back to safe values.
            $effective = (isset($result[0]) && $result[0] !== '') ? $result[0] : $target;
            $code      = (isset($result[1]) && $result[1] !== '') ? $result[1] : '000';
            $time      = isset($result[2]) ? $result[2] : '';

            if ($code[0] == '2')
                $color = 'green';
            elseif ($code[0] == '3')
                $color = 'yellow';
            else
                $color = 'red';

            // Everything that originates from the remote page is HTML-escaped.
            echo (++$i).'. <font color="'.$color.'">'.htmlspecialchars($code).'</font> '.htmlspecialchars($time).' '.htmlspecialchars(str_pad($effective,50,' ',STR_PAD_RIGHT))."\n";
            flush();
        }
    }
}
// The results iframe initially loads this script with ?iframe=...; answer
// with placeholder text only and stop before the form is rendered.
if (isset($_GET['iframe'])) {
    echo 'Results will appear here';
    exit(0);
}
// You have submitted a URL to check: fetch the page with command-line curl,
// report how the fetch went, then check the links or files it references.
if (isset($_POST['url'], $_POST['choice'])) {
    $submitted = is_string($_POST['url']) ? trim($_POST['url']) : '';
    $parts     = parse_url($submitted);
    $scheme    = isset($parts['scheme']) ? strtolower($parts['scheme']) : '';

    // Only http/https are accepted; other schemes (e.g. file://localhost/...)
    // also carry a host component but must never be passed on to curl.
    if (!isset($parts['host']) || ($scheme !== 'http' && $scheme !== 'https'))
        echo 'The URL you provided was invalid, please submit a valid URL';
    else {
        // Prepare the command to send to cURL (on the command line). The page
        // body goes to temp.txt; stdout carries
        // "<effective url>\t<status>\t<bytes>\t<seconds>".
        $command = 'curl -A "Broken Link Checker" -s --max-redirs 5 -m 5 --retry 1 --retry-delay 10 -w "%{url_effective}\t%{http_code}\t%{size_download}\t%{time_total}" -o temp.txt '.escapeshellarg($submitted);

        // Run it; pad the result so a failed/missing curl cannot cause
        // undefined-offset errors below.
        $result = array_pad(explode("\t", (string) shell_exec($command)), 4, '');
        $status = ($result[1] !== '') ? $result[1] : '000';

        // Check the HTTP response type
        if ($status[0] == '2')
            $color = 'green';
        elseif ($status[0] == '3')
            $color = 'yellow';
        else
            $color = 'red';

        // Values derived from user input or the remote server are HTML-escaped.
        echo '<sup>Fetched '.htmlspecialchars($result[0]).' ('.htmlspecialchars($result[2]).' bytes) in '.htmlspecialchars($result[3]).' seconds, it returned a <font color="'.$color.'">'.htmlspecialchars($status).'</font> response</sup>';
        echo '<pre><br />';

        // Only parse when curl actually stored a non-empty document;
        // DOMDocument::loadHTML() rejects empty input.
        $html = is_file('temp.txt') ? file_get_contents('temp.txt') : false;
        if (is_string($html) && $html !== '') {
            $_html_parser = new html_parser;
            $dom = new DOMDocument;
            @$dom->loadHTML($html); // real-world markup is rarely valid; silence libxml warnings

            $i = 0; // running count of checked URLs, shared by all calls below
            if ($_POST['choice'] === 'Check Links') { // Checking <a> and <area> references
                $_html_parser->parse_for_links($dom,$submitted,'a','href',$i);
                $_html_parser->parse_for_links($dom,$submitted,'area','href',$i);
            }
            elseif ($_POST['choice'] === 'Check Files') { // Checking <link>, <script> and <img> references
                $_html_parser->parse_for_links($dom,$submitted,'link','href',$i);
                $_html_parser->parse_for_links($dom,$submitted,'script','src',$i);
                $_html_parser->parse_for_links($dom,$submitted,'img','src',$i);
            }
        }
        echo '</pre>';

        // Remove the scratch files written by curl
        is_file('temp.txt') && unlink('temp.txt');
        is_file('temp2.txt') && unlink('temp2.txt');
    }
    exit(0);
}
// Introductory text, the submission form (posting into the results iframe)
// and the iframe itself, which first loads this script with ?iframe=1.
echo '<p>Use the tool below to see if there are any broken links on your site.</p>
<p>The URL you provide will be fetched and then parsed for links and images. Each of those will then be fetched with a 5 second limit, which is ample time. Any requests that aren\'t made in that time should be classed as broken or currently unavailable. A typical request usually takes a quarter of a second. A maximum of 100 links/files will be checked</p>
<p><font color="green">Green</font> indicates a healthy link, <font color="yellow">yellow</font> indicates a redirect, which may or may not lead to a healthy page. Finally, <font color="red">red</font> indicates a broken link, by either pointing to a defunct page or to a server that is unresponsive.</p>
<p>If the URL you submit is a redirect and returns one URL, try using that URL instead.</p>
<form method="post" target="iframe">
<p>Enter a URL: <input type="text" name="url" value="" style="width:50%" />
<select name="choice"><option>Check Links</option><option>Check Files</option></select>
<input type="submit" value="Check It!" />
</p></form>
<div align="center"><iframe id="iframe" name="iframe" style="width:95%" src="?iframe=1"></iframe></div>
<p>Please bear in mind that it is possible that different users receive different responses from the links you see above. The results you see should be indicative only and second checked manually should further investigation be required.</p>';