Dealing with unqualified HREF values (Part 2)

In my original blog post, Dealing with unqualified HREF values, I put forward a method for converting an href value in any format into a fully-qualified URL, using data from the document location object.

However, as one commentator pointed out, the proposed solution couldn’t cater for changes in location context, such as that caused by a <base> element, or within the context of an included document such as a stylesheet or a page in an <iframe>.

To fix that it was necessary to abandon the use of the location object entirely, and parse URLs as strings. But the upshot of this is a far more flexible and useful method, that automatically caters for <base> elements, and is also able to accept an input location string to use as context.

So from the original code, this:

//get the current document location object
//(this is the original approach: the location object always describes the
//page itself, so it cannot reflect a <base> element or an external context)
var loc = document.location;

Becomes this:

//excerpt from the body of qualifyHREF(href, context), shown in full further on;
//"context" here is that function's second argument

//get the current document location href
//(this is the default context, used when nothing below overrides it)
var here = document.location.href;

//look for a base element to use instead
//(only the first <base> in the document is consulted)
var bases = document.getElementsByTagName('base');
if(bases.length > 0)
{
	var basehref = bases[0].getAttribute('href');
	//ignore a <base> that has no href, or an empty one
	if(basehref && basehref != '')
	{
		here = basehref;
	}
}

//if the context argument is present and non-empty string, use that instead
//(so an explicit context wins over both the <base> element and the location)
if(typeof context == 'string' && context != '')
{
	here = context;
}

//extract the protocol, host and path
//and create a location object with the data
//(replace() with a string pattern only changes the first match, so just the
//"//" after the protocol is collapsed, and the split then yields
//[protocol, host, ...path segments])
var parts = here.replace('//', '/').split('/');
var loc = {
	'protocol' : parts[0],
	'host' : parts[1]
	}
//drop the protocol and host, leaving only the path segments
parts.splice(0, 2);
//note: any query string or hash present in "here" stays part of this pathname
loc.pathname = '/' + parts.join('/');

Here’s the complete revised function:

//qualify an HREF to form a complete URI
//
//href    - the href value to qualify; may be already qualified, protocol
//          relative (//host/path), root relative (/path), explicitly relative
//          (./path), an up-reference (../path) or a plain relative path
//context - optional fully-qualified URL to resolve against; when it is a
//          non-empty string it takes precedence over any <base> element and
//          over the document location, and the document is never accessed
//
//returns the fully-qualified URI as a string
function qualifyHREF(href, context)
{
	var here;

	//if the context argument is present and non-empty string, use that
	//(checked first, so the function also works where no document exists)
	if(typeof context == 'string' && context != '')
	{
		here = context;
	}
	else
	{
		//get the current document location href
		here = document.location.href;

		//look for a base element to use instead
		var bases = document.getElementsByTagName('base');
		if(bases.length > 0)
		{
			var basehref = bases[0].getAttribute('href');
			if(basehref && basehref != '')
			{
				here = basehref;
			}
		}
	}

	//discard any hash or query string from the context: they play no part in
	//resolving a path, and a slash inside them would corrupt the split below
	here = here.split('#')[0].split('?')[0];

	//extract the protocol, host and path
	//and create a location object with the data
	//(only the first "//" is replaced, ie. the one after the protocol)
	var parts = here.replace('//', '/').split('/');
	var loc = {
		'protocol' : parts[0],
		'host' : parts[1]
		};
	parts.splice(0, 2);
	loc.pathname = '/' + parts.join('/');

	//build a base URI from the protocol plus host (which includes port if applicable)
	var uri = loc.protocol + '//' + loc.host;

	//if the input path is relative-from-here
	//just delete the ./ token to make it relative
	var target = href.replace(/^\.\//, '');

	//if the input href is already qualified, copy it unchanged
	//(scheme names are case-insensitive and may contain digits, "+", "." and "-")
	if(/^[a-z][a-z0-9+.\-]*:\/\//i.test(target))
	{
		return target;
	}

	//or if it begins with two slashes it's protocol relative
	//so it only needs the protocol of the context
	if(target.substr(0, 2) == '//')
	{
		return loc.protocol + target;
	}

	//or if the input href begins with a leading slash, then it's base relative
	//so just add the input href to the base URI
	if(target.substr(0, 1) == '/')
	{
		return uri + target;
	}

	//get the directory parts of the context path, which means every
	//non-empty part except the last one (this page, or nothing after a slash)
	var segments = loc.pathname.split('/');
	var dirs = [];
	for(var i=0; i<segments.length - 1; i++)
	{
		if(segments[i] != '')
		{
			dirs.push(segments[i]);
		}
	}

	//or if it's an up-reference we need to compute the path
	var ups = target.match(/^((?:\.\.\/)+)([^\/].*)?$/);
	if(ups)
	{
		//count only the leading up-references (each "../" is three characters)
		//so that a "../" further inside the path is left alone
		var references = ups[1].length / 3;

		//for each of the up-references, delete the last directory,
		//but never climb above the root
		dirs = dirs.slice(0, Math.max(0, dirs.length - references));

		//what remains is the last part of the path, minus up-references
		target = ups[2] || '';
	}

	//now rebuild the path to the directory
	var path = '/';
	if(dirs.length > 0)
	{
		path += dirs.join('/') + '/';
	}

	//then add the path and input href to the base URI
	//and return the final uri
	return uri + path + target;
}

But wait … there’s more!

Having done that, I realised I was only a hop and a skip away from implementing a JavaScript equivalent of PHP’s parse_url function:

//parse a URL to form an object of properties
//
//url - a fully-qualified URL string, ie. protocol://host[:port]/path?query#hash
//
//returns an object of strings with the properties href, protocol, host,
//hostname, port, pathname, hash and search; parts that are absent from
//the URL come back as empty strings (pathname is at least "/")
function parseURL(url)
{
	//save the unmodified url to href property
	//so that the object we get back contains
	//all the same properties as the built-in location object
	var loc = { 'href' : url };

	//extract any hash first, which is everything from the first "#" onwards,
	//so that a "?" or "/" inside it can't be mistaken for other components
	var remainder = url;
	var hash = '';
	var hashAt = remainder.indexOf('#');
	if(hashAt != -1)
	{
		hash = remainder.substr(hashAt);
		remainder = remainder.substr(0, hashAt);
	}

	//then extract any search query, which is everything from the first "?",
	//before looking at the host, so that "http://host?query" parses correctly
	var search = '';
	var searchAt = remainder.indexOf('?');
	if(searchAt != -1)
	{
		search = remainder.substr(searchAt);
		remainder = remainder.substr(0, searchAt);
	}

	//split what's left by single-slashes to get the component parts
	//(only the first "//" is replaced, ie. the one after the protocol)
	var parts = remainder.replace('//', '/').split('/');

	//store the protocol and host (the host is empty if the URL has none)
	loc.protocol = parts[0];
	loc.host = parts.length > 1 ? parts[1] : '';

	//extract any port number from the host
	//from which we derive the port and hostname
	//(only a trailing ":digits" counts as a port, so the colons inside
	//an IPv6 literal such as [::1] stay part of the hostname)
	var hostparts = loc.host.match(/^([\s\S]*?)(?::(\d*))?$/);
	loc.hostname = hostparts[1];
	loc.port = hostparts[2] || '';

	//splice and join the remainder to get the pathname
	parts.splice(0, 2);
	loc.pathname = '/' + parts.join('/');

	//store the hash and search query extracted earlier
	loc.hash = hash;
	loc.search = search;

	//return the final object
	return loc;
}

So that’s two more for the toolkit!

Free book: Jump Start HTML5 Basics

Grab a free copy of one of our latest ebooks! Packed with hints and tips on HTML5's most powerful new features.

  • Julien Royer

    I think there is a much easier solution:

    //qualify an HREF by letting the browser resolve it:
    //the href is assigned to the src of a detached image element and the
    //value is read straight back
    function qualifyHREF(href) {
      var img = document.createElement("img");
      //assign the function's own argument (not an undeclared "url" variable)
      img.src = href;
      return img.src;
    }
  • logic_earth

    I don’t understand Julien, how does that snippet make a qualified URI?

  • Julien Royer

    @logic_earth: the src attribute is a complex attribute (a setter/getter couple), which does as far as I know consistently return the qualified URL in the current browsers (FF, IE, Opera, Safari).

  • http://www.brothercake.com/ brothercake

    Yup that’s a pretty neat hack, and in most cases works perfectly well.

    My major concern with it is that every time it’s used it makes a server request, which adds to the latency and process overhead of an application that uses it, as well as making it unsuitable for use in abstraction (where you don’t have access to a document object). It will also fail if the href is an empty string (returning an empty string, instead of a reference to the parent folder).

    It’s the server request thing that would make me not want to use it; but still, it’s a nice solution :)

  • Julien Royer

    @brothercake: thank you for your answer. I thought the browser would wait for the image to be added to the document tree before making the server request, but you seem to be right.

    I don’t understand why browsers don’t offer the possibility to natively qualify a URL.

  • pbfranklin.mtech@gmail.com

    Could anyone please tell me how to get the href value from the page in a script function on the same page?