Dealing with unqualified HREF values (Part 2)

    James Edwards
    Share

    In my original blog post, Dealing with unqualified HREF values, I put forward a method for converting an href value in any format into a fully-qualified URL, using data from the document location object.

    However, as one commenter pointed out, the proposed solution couldn’t cater for changes in location context, such as those caused by a <base> element, or those that apply inside an included document such as a stylesheet or a page in an <iframe>.

    To fix that, it was necessary to abandon the use of the location object entirely and parse URLs as strings. The upshot is a far more flexible and useful method that automatically caters for <base> elements and can also accept an input location string to use as context.

    So from the original code, this:

    //get the current document location object
    //(this is the line from the original function that the revised code replaces)
    var loc = document.location;

    Becomes this:

    //excerpt from the body of qualifyHREF(href, context):
    //works out which URL string ("here") to resolve against,
    //then splits it into a location-like object

    //get the current document location href
    var here = document.location.href;
    
    //look for a base element to use instead
    //(only the first base element in the document is consulted)
    var bases = document.getElementsByTagName('base');
    if(bases.length > 0)
    {
    	var basehref = bases[0].getAttribute('href');
    	if(basehref && basehref != '')
    	{
    		//the attribute value is used exactly as returned,
    		//so it's assumed to be a fully-qualified URL already
    		here = basehref;
    	}
    }
    
    //if the context argument is present and non-empty string, use that instead
    //(context is the second argument of the complete function; it takes
    //precedence over both the document location and any base element)
    if(typeof context == 'string' && context != '')
    {
    	here = context;
    }
    
    //extract the protocol, host and path
    //and create a location object with the data
    //(replace() with a string pattern only changes the first '//',
    //so 'http://host/a/b' splits into ['http:', 'host', 'a', 'b'])
    var parts = here.replace('//', '/').split('/');
    var loc = {
    	'protocol' : parts[0],
    	'host' : parts[1]
    	}
    //drop the protocol and host, then re-join what's left as the pathname
    //(nothing here removes a query string or fragment, so if the
    //URL has either it will still be part of this pathname)
    parts.splice(0, 2);
    loc.pathname = '/' + parts.join('/');

    Here’s the complete revised function:

    //qualify an HREF to form a complete URI
    //
    //href is the value to qualify; it can be already-qualified,
    //protocol-relative (//host/path), root-relative (/path),
    //relative (with or without ./ and ../ tokens),
    //or just a query string or fragment
    //
    //context is an optional fully-qualified URL to resolve against;
    //if it's missing or empty, the href of the first <base> element
    //is used if there is one, otherwise the document location href
    //
    //returns the qualified URI as a string
    function qualifyHREF(href, context)
    {
    	//matches anything that begins with a URI scheme (http:, HTTPS:, mailto: etc)
    	var scheme = /^[a-z][a-z0-9+.\-]*:/i;
    
    	//if the input href is already qualified, return it unchanged
    	if(scheme.test(href))
    	{
    		return href;
    	}
    
    	//if the context argument is present and non-empty string, resolve against that
    	var here;
    	if(typeof context == 'string' && context != '')
    	{
    		here = context;
    	}
    
    	//otherwise use the current document location href
    	else
    	{
    		here = document.location.href;
    
    		//but look for a base element to use instead
    		var bases = document.getElementsByTagName('base');
    		if(bases.length > 0)
    		{
    			var basehref = bases[0].getAttribute('href');
    			if(basehref && basehref != '')
    			{
    				//the attribute may itself be unqualified (eg. "/assets/")
    				//in which case qualify it against the document location;
    				//passing a context means the nested call never looks for
    				//a base element, so this can't recurse more than once
    				here = scheme.test(basehref) ? basehref : qualifyHREF(basehref, here);
    			}
    		}
    	}
    
    	//the context's fragment never takes part in resolution, and its
    	//query string is only kept when the href doesn't supply a path
    	var nofragment = here.split('#')[0];
    	var noquery = nofragment.split('?')[0];
    
    	//an empty href, or one that's only a fragment or query string,
    	//refers to the context document itself rather than its directory
    	if(href === '')
    	{
    		return nofragment;
    	}
    	if(href.charAt(0) === '#')
    	{
    		return nofragment + href;
    	}
    	if(href.charAt(0) === '?')
    	{
    		return noquery + href;
    	}
    
    	//extract the protocol, host and path
    	//and create a location object with the data
    	//(the query string was removed first because it may contain slashes)
    	var parts = noquery.replace('//', '/').split('/');
    	var loc = {
    		'protocol' : parts[0],
    		'host' : parts[1]
    		};
    	parts.splice(0, 2);
    	loc.pathname = '/' + parts.join('/');
    
    	//if the input href is protocol-relative it only needs the protocol
    	if(href.slice(0, 2) === '//')
    	{
    		return loc.protocol + href;
    	}
    
    	//build a base URI from the protocol plus host (which includes port if applicable)
    	var uri = loc.protocol + '//' + loc.host;
    
    	//set aside the href's own query string and fragment
    	//so that any dots and slashes they contain are left alone
    	var path = href;
    	var suffix = '';
    	var cut = path.search(/[?#]/);
    	if(cut !== -1)
    	{
    		suffix = path.slice(cut);
    		path = path.slice(0, cut);
    	}
    
    	//if the path doesn't begin with a leading slash then it's relative,
    	//so prefix it with the context directory
    	//(the context pathname minus its last part, this page)
    	if(path.charAt(0) !== '/')
    	{
    		path = loc.pathname.replace(/[^\/]*$/, '') + path;
    	}
    
    	//now resolve any ./ and ../ tokens, wherever they are in the path
    	//(starting from 1 to skip the empty part before the leading slash)
    	var segments = path.split('/');
    	var resolved = [];
    	for(var i=1; i<segments.length; i++)
    	{
    		var last = (i === segments.length - 1);
    
    		//a here-reference just disappears
    		if(segments[i] === '.')
    		{
    			if(last) { resolved.push(''); }
    		}
    
    		//an up-reference deletes the previous part of the path
    		//(at the root there's nothing to delete, so it's ignored)
    		else if(segments[i] === '..')
    		{
    			resolved.pop();
    			if(last) { resolved.push(''); }
    		}
    
    		//anything else is kept as it is
    		else
    		{
    			resolved.push(segments[i]);
    		}
    	}
    
    	//return the final uri
    	return uri + '/' + resolved.join('/') + suffix;
    }

    But wait … there’s more!

    Having done that, I realised I was only a hop and a skip away from implementing a JavaScript equivalent of PHP’s parse_url function:

    //parse a URL to form an object of properties
    //
    //url is a fully-qualified URL string
    //
    //returns an object with the same string properties as the built-in
    //location object: href, protocol, host, hostname, port, pathname,
    //hash and search (any that aren't present are an empty string,
    //except pathname which is always at least "/")
    function parseURL(url)
    {
    	var rest = url;
    
    	//extract any hash first, since it runs to the very end of the URL
    	//and may itself contain slashes, question-marks or further hashes
    	var hash = '';
    	var mark = rest.indexOf('#');
    	if(mark !== -1)
    	{
    		hash = rest.slice(mark);
    		rest = rest.slice(0, mark);
    	}
    
    	//then extract any search query, which runs from the first question-mark
    	//to the end of what's left (and may also contain slashes or question-marks)
    	var search = '';
    	mark = rest.indexOf('?');
    	if(mark !== -1)
    	{
    		search = rest.slice(mark);
    		rest = rest.slice(0, mark);
    	}
    
    	//split the remainder by single-slashes to get the component parts
    	//(replace() with a string pattern only changes the first '//')
    	var parts = rest.replace('//', '/').split('/');
    
    	//store the protocol and host
    	//(if the input has no slashes then there's no host part at all)
    	var protocol = parts[0];
    	var host = parts.length > 1 ? parts[1] : '';
    
    	//remove any user-info (user:password@) which isn't part of the host
    	mark = host.lastIndexOf('@');
    	if(mark !== -1)
    	{
    		host = host.slice(mark + 1);
    	}
    
    	//extract any port number from the host
    	//from which we derive the port and hostname;
    	//a colon that's followed by a closing bracket is part of
    	//an IPv6 address such as [::1] and not a port separator
    	var hostname = host;
    	var port = '';
    	mark = host.lastIndexOf(':');
    	if(mark !== -1 && host.indexOf(']', mark) === -1)
    	{
    		hostname = host.slice(0, mark);
    		port = host.slice(mark + 1);
    	}
    
    	//splice and join the remainder to get the pathname
    	parts.splice(0, 2);
    	var pathname = '/' + parts.join('/');
    
    	//return the final object, saving the unmodified url to href property
    	//so that the object we get back contains
    	//all the same properties as the built-in location object
    	return {
    		'href' : url,
    		'protocol' : protocol,
    		'host' : host,
    		'hostname' : hostname,
    		'port' : port,
    		'pathname' : pathname,
    		'hash' : hash,
    		'search' : search
    		};
    }

    So that’s two more for the toolkit!