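/*
    Helpers for reducing a host name to its "base" domain, for example:
        www.google.com          => google.com
        subdomain.domain.com.au => domain.com.au
        blah.jp                 => blah.jp
    See my post here. (NOTE: the link to that post is missing from this comment.)
*/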
// TODO: this list should be updated from time to time, automatically.
// taken from http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains

var gTopLevelDomainDict = 
{
	"ac":1,"ad":1,"ae":1,"aero":1,"af":1,"ag":1,"ai":1,"al":1,"am":1,
	"an":1,"ao":1,"aq":1,"ar":1,"arpa":1,"as":1,"at":1,"au":1,"aw":1,
	"az":1,"ba":1,"bb":1,"bd":1,"be":1,"bf":1,"bg":1,"bh":1,"bi":1,
	"biz":1,"bj":1,"bm":1,"bn":1,"bo":1,"br":1,"bs":1,"bt":1,"bv":1,
	"bw":1,"by":1,"bz":1,"ca":1,"cc":1,"cd":1,"cf":1,"cg":1,"ch":1,
	"ci":1,"ck":1,"cl":1,"cm":1,"cn":1,"co":1,"com":1,"coop":1,"cr":1,
	"cu":1,"cv":1,"cx":1,"cy":1,"cz":1,"de":1,"dj":1,"dk":1,"dm":1,
	"do":1,"dz":1,"ec":1,"edu":1,"ee":1,"eg":1,"er":1,"es":1,"et":1,
	"fi":1,"fj":1,"fk":1,"fm":1,"fo":1,"fr":1,"ga":1,"gb":1,"gd":1,
	"ge":1,"gf":1,"gg":1,"gh":1,"gi":1,"gl":1,"gm":1,"gn":1,"gov":1,
	"gp":1,"gq":1,"gr":1,"gs":1,"gt":1,"gu":1,"gw":1,"gy":1,"hk":1,
	"hm":1,"hn":1,"hr":1,"ht":1,"hu":1,"id":1,"ie":1,"il":1,"im":1,
	"in":1,"info":1,"int":1,"io":1,"iq":1,"ir":1,"is":1,"it":1,"je":1,
	"jm":1,"jo":1,"jp":1,"ke":1,"kg":1,"kh":1,"ki":1,"km":1,"kn":1,
	"kr":1,"kw":1,"ky":1,"kz":1,"la":1,"lb":1,"lc":1,"li":1,"lk":1,
	"lr":1,"ls":1,"lt":1,"lu":1,"lv":1,"ly":1,"ma":1,"mc":1,"md":1,
	"mg":1,"mh":1,"mil":1,"mk":1,"ml":1,"mm":1,"mn":1,"mo":1,"mp":1,
	"mq":1,"mr":1,"ms":1,"mt":1,"mu":1,"museum":1,"mv":1,"mw":1,"mx":1,
	"my":1,"mz":1,"na":1,"name":1,"nc":1,"ne":1,"net":1,"nf":1,"ng":1,
	"ni":1,"nl":1,"no":1,"np":1,"nr":1,"nu":1,"nz":1,"om":1,"org":1,
	"pa":1,"pe":1,"pf":1,"pg":1,"ph":1,"pk":1,"pl":1,"pm":1,"pn":1,
	"pr":1,"pro":1,"ps":1,"pt":1,"pw":1,"py":1,"qa":1,"re":1,"ro":1,
	"ru":1,"rw":1,"sa":1,"sb":1,"sc":1,"sd":1,"se":1,"sg":1,"sh":1,
	"si":1,"sj":1,"sk":1,"sl":1,"sm":1,"sn":1,"so":1,"sr":1,"st":1,
	"su":1,"sv":1,"sy":1,"sz":1,"tc":1,"td":1,"tf":1,"tg":1,"th":1,
	"tj":1,"tk":1,"tm":1,"tn":1,"to":1,"tp":1,"tr":1,"tt":1,"tv":1,
	"tw":1,"tz":1,"ua":1,"ug":1,"uk":1,"um":1,"us":1,"uy":1,"uz":1,
	"va":1,"vc":1,"ve":1,"vg":1,"vi":1,"vn":1,"vu":1,"wf":1,"ws":1,
	"ye":1,"yt":1,"yu":1,"za":1,"zm":1,"zw":1 
};
// break authority into two parts: subdomain(s), and base authority
// e.g. images.google.com --> [images, google.com]
//      www.popo.com.au --> [www, popo.com.au]
function splitAuthority(aAuthority)
{
	// walk down from right, stop at (but include) first non-toplevel domain
	chunks = aAuthority.split(/\./).reverse();
	var baseAuthority="";
	var subDomain="";
	var i=0;
	var foundBreak = false;
	for (i in chunks)
	{
		if (!foundBreak) 
			baseAuthority = chunks[i] + (baseAuthority ? "." : "") + baseAuthority;
		else
			subDomain = chunks[i] + (subDomain ? "." : "") + subDomain
		if (gTopLevelDomainDict[chunks[i]] != 1) foundBreak=true;
	}
	return ([subDomain,baseAuthority])
}

// URI object to be returned
function URI(scheme,authority,subDomain,baseAuthority,path,query,fragment)
{
	this.scheme = scheme;
	this.subDomain = subDomain;
	this.baseAuthority = baseAuthority;
	this.authority = authority;
	this.path = path;
	this.query = query;
	this.fragment = fragment;
}
// function to split URI into its parts, returned as URI object
function decomposeURI(aURI)
{
	// Javascript doesn't like a question mark as the first item 
	// in a regex grouping. Why is that?
	// TODO: handle port #'s, query
	var uriDef =   "^(([^:/?#]+):)?(//([^/?#]*))?([^#]*)(#(.*))?"
	//              012            4  5          6      7 8
	var myRegEp = new RegExp(uriDef,"g")
	var m = myRegEp.exec(aURI);
	if (!m) return false;

	var scheme = m[2] ? m[2] : "";
	var authority = m[4] ? m[4] : ""; 
	var path = m[5] ? m[5] : "";
	var query = "";
	var fragment = m[8] ? m[8] : "";
	var s = splitAuthority(authority);
	var subDomain = s[0];
	var baseAuthority = s[1];
	
	return(new URI(scheme,authority,subDomain,baseAuthority,path,query,fragment))
}

// this function does a best-guess of what is important in a URI
// rules: if a specific file path is mentioned, we take exact URI
// else ...
// forceGeneralize is a boolean, which if true, means we should
// generalize even if there is a specific path mentioned in the URI.
function generalizeURI(aURI, forceGeneralize)
{
	var uriParts = decomposeURI(aURI) ;

	// only mess with http,https,and ftp schemes
	if (!((uriParts.scheme == "http") ||
				(uriParts.scheme == "https") ||
				(uriParts.scheme == "ftp")))
	{
		return (aURI)	
	}
	
	// generalize to base authority only if a specific page wasn't mentioned.
	if (forceGeneralize || 
	    ((uriParts.path=="") || (uriParts.path.lastIndexOf('/')==0)))
	{
		if ((uriParts.subDomain=="") || (uriParts.subDomain=="www"))
		{
			// e.g. google.com
			return (uriParts.baseAuthority)
		}
		else
		{
			// e.g. images.google.com
			return (uriParts.authority);
		}
		
	}
	else
	{
		// e.g. www.wanderingstan.com/projects/lsa.html
		return (aURI)
	}
}