/* @(#)url.c 1.13 95/09/16 */

/*
 * Copyright (c) 1994, 1995 by Wayne C. Gramlich.  All rights reserved.
 *
 * Permission to use, copy, modify, distribute, and sell this software
 * for any purpose is hereby granted without fee provided that the above
 * copyright notice and this permission are retained.  The author makes
 * no representations about the suitability of this software for any purpose.
 * It is provided "as is" without express or implied warranty.
 */

/* LINTLIBRARY */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include "pwd.h"
#include "config_extern.h"
#include "memory_extern.h"
#include "str_extern.h"
#include "url_extern.h"

#define LETTERS "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
#define DIGITS "0123456789"
#define PROTOCOL_CHARACTERS (Str)(LETTERS DIGITS "_-")
#define HOST_NAME_CHARACTERS (Str)(LETTERS DIGITS "._-")
#define PORT_CHARACTERS (Str)(DIGITS)

/*
 * A URL broken into its textual pieces.  The fields are declared in the
 * order in which {url_str} and {url_write} concatenate them, so printing
 * every field in declaration order reproduces the URL text.  {url_parse}
 * fills in every field; a piece that is absent is stored as an empty
 * string rather than as a null pointer (see {url_is_valid}).
 */
struct url_struct {
    Str protocol_name;		/* Protocol name (or "") */
    Str protocol_colon;		/* Colon following protocol (or "") */
    Str host_slash_slash;	/* "//" or "" */
    Str host_name;		/* Host name (or "") */
    Str port_colon;		/* Colon preceding port number (or "") */
    Str port_number;		/* Port number (or "") */
    Str path_head_slash;	/* Leading slash for path (or "") */
    Str path_twiddle;		/* Twiddle character for user name (or "") */
    Str path_user;		/* User name (or "") */
    Str path_user_slash;	/* Slash following user name (or "") */
    Str path;			/* Path preceding document name (or "") */
    Str path_tail_slash;	/* Slash following path (or "") */
    Str document;		/* Name of document up to last period (or "") */
    Str suffix_period;		/* Period preceding suffix (or "") */
    Str suffix_name;		/* Suffix name (or "") */
    Str anchor_query_prefix;	/* Anchor/query character ("#", "?", or "") */
    Str anchor_query;		/* The anchor or query portion (or "") */
};    

/* Forward declarations of the routines that are private to this file: */
static Url url_create(void);
static int url_is_valid(Url);

/*
 * url_anchor(url)
 *	This routine will return the anchor portion of {url}, or
 *	{(Str)""} if there is none.  The anchor is only reported when
 *	the anchor/query prefix of {url} is "#"; a query ("?") yields
 *	the empty string.  The returned anchor is the string stored in
 *	{url}, not a copy.
 */
Str
url_anchor(
    Url url)
{
    return str_equal(url->anchor_query_prefix, (Str)"#")
      ? url->anchor_query
      : (Str)"";
}

/*
 * url_annote(url, config)
 *	This routine will return a new {Url} corresponding to {url}'s
 *	annotation URL.  The result is a copy of {url} whose path has
 *	been prefixed with "Annote/"; {url} itself is not modified.
 *	{config} is currently not consulted.
 */
Url
url_annote(
    Url url,
    Config config)
{
    Url annotated = url_copy(url);
    Str prefixed_path = str_printf("Annote/%s", annotated->path);

    /* Swap the copied path for the prefixed one: */
    str_free(annotated->path);
    annotated->path = prefixed_path;
    return annotated;
}

/*
 * url_create()
 *	This routine will create and return an empty {Url}.
 */
static Url
url_create(void)
{
    Url url;

    url = (Url)memory_allocate_zeroed(sizeof *url);
    return url;
}

/*
 * url_copy(url)
 *	This routine will return a new {Url} in which every field is a
 *	{str_copy} of the corresponding field of {url}.
 */
Url
url_copy(
    Url url)
{
    Url copy = url_create();

    /* Duplicate one field of {url} into {copy}: */
#define URL_COPY_FIELD(field) (copy->field = str_copy(url->field))

    /* Protocol, host and port: */
    URL_COPY_FIELD(protocol_name);
    URL_COPY_FIELD(protocol_colon);
    URL_COPY_FIELD(host_slash_slash);
    URL_COPY_FIELD(host_name);
    URL_COPY_FIELD(port_colon);
    URL_COPY_FIELD(port_number);

    /* Path: */
    URL_COPY_FIELD(path_head_slash);
    URL_COPY_FIELD(path_twiddle);
    URL_COPY_FIELD(path_user);
    URL_COPY_FIELD(path_user_slash);
    URL_COPY_FIELD(path);
    URL_COPY_FIELD(path_tail_slash);

    /* Document, suffix and anchor/query: */
    URL_COPY_FIELD(document);
    URL_COPY_FIELD(suffix_period);
    URL_COPY_FIELD(suffix_name);
    URL_COPY_FIELD(anchor_query_prefix);
    URL_COPY_FIELD(anchor_query);

#undef URL_COPY_FIELD

    return copy;
}

/*
 * url_copy_replace_document(url, document)
 *	This routine will return a copy of {url} (made with {url_copy})
 *	whose document name has been replaced by {document}.  Note that
 *	{document} itself is stored in the result; it is not copied.
 */
Url
url_copy_replace_document(
    Url url,
    Str document)
{
    Url replaced = url_copy(url);

    /* Discard the copied document name in favor of the new one: */
    str_free(replaced->document);
    replaced->document = document;
    return replaced;
}

/*
 * url_db_file_name(local_url, config)
 *	This routine will return the data base file name associated with
 *	{local_url} using {config}.  The name is the local file path of
 *	{local_url}'s annotation URL with ".db" appended.  The result is
 *	a newly allocated string.
 */
Str
url_db_file_name(
    Url local_url,
    Config config)
{
    Url annote_url;
    Str annote_file_name;
    Str db_file_name;

    annote_url = url_annote(local_url, config);
    annote_file_name = url_str_local_path(annote_url, config);
    db_file_name = str_printf("%s.db", annote_file_name);

    /* The intermediate path is only needed to build the result: */
    str_free(annote_file_name);

    /*
     * NOTE(review): {annote_url} is a fresh copy made by {url_annote}
     * and is not released here because this file does not provide a
     * routine for freeing a {Url}.
     */
    return db_file_name;
}

/*
 * url_document(url)
 *	This routine will return the document name stored in {url}.
 *	The string belongs to {url}; no copy is made.
 */
Str
url_document(
    Url url)
{
    Str document;

    document = url->document;
    return document;
}

/*
 * url_document_equal(url1, url2)
 *	This routine will return 1 if every field of {url1} compares
 *	equal (using {str_equal}) to the corresponding field of {url2}
 *	and 0 otherwise.  The anchor/query fields take part in the
 *	comparison as well.
 */
int
url_document_equal(
    Url url1,
    Url url2)
{
    /* Protocol, host and port must agree: */
    if (!(str_equal(url1->protocol_name, url2->protocol_name) &&
      str_equal(url1->protocol_colon, url2->protocol_colon) &&
      str_equal(url1->host_slash_slash, url2->host_slash_slash) &&
      str_equal(url1->host_name, url2->host_name) &&
      str_equal(url1->port_colon, url2->port_colon) &&
      str_equal(url1->port_number, url2->port_number))) {
	return 0;
    }

    /* So must every piece of the path: */
    if (!(str_equal(url1->path_head_slash, url2->path_head_slash) &&
      str_equal(url1->path_twiddle, url2->path_twiddle) &&
      str_equal(url1->path_user, url2->path_user) &&
      str_equal(url1->path_user_slash, url2->path_user_slash) &&
      str_equal(url1->path, url2->path) &&
      str_equal(url1->path_tail_slash, url2->path_tail_slash))) {
	return 0;
    }

    /* Finally the document, its suffix and any anchor or query: */
    if (!(str_equal(url1->document, url2->document) &&
      str_equal(url1->suffix_period, url2->suffix_period) &&
      str_equal(url1->suffix_name, url2->suffix_name) &&
      str_equal(url1->anchor_query_prefix, url2->anchor_query_prefix) &&
      str_equal(url1->anchor_query, url2->anchor_query))) {
	return 0;
    }
    return 1;
}

/*
 * url_host_name(url)
 *	This routine will return the host name stored in {url}.  The
 *	string belongs to {url}; no copy is made.
 */
Str
url_host_name(
    Url url)
{
    Str host_name;

    host_name = url->host_name;
    return host_name;
}

/*
 * url_is_http(url)
 *	This routine will return 1 if the protocol name of {url} is
 *	"http" (as decided by {str_equal}) and 0 otherwise.
 */
int
url_is_http(
    Url url)
{
    return str_equal(url->protocol_name, (Str)"http") ? 1 : 0;
}

/*
 * url_is_valid(url)
 *	This routine will return 1 if every field in {url} is a
 *	non-null pointer and 0 as soon as any field is null.  It is
 *	used by {url_parse} as a sanity check.
 */
static int
url_is_valid(
    Url url)
{
    int complete;

    complete =
      (url->protocol_name != (Str)0) &&
      (url->protocol_colon != (Str)0) &&
      (url->host_slash_slash != (Str)0) &&
      (url->host_name != (Str)0) &&
      (url->port_colon != (Str)0) &&
      (url->port_number != (Str)0) &&
      (url->path_head_slash != (Str)0) &&
      (url->path_twiddle != (Str)0) &&
      (url->path_user != (Str)0) &&
      (url->path_user_slash != (Str)0) &&
      (url->path != (Str)0) &&
      (url->path_tail_slash != (Str)0) &&
      (url->document != (Str)0) &&
      (url->suffix_period != (Str)0) &&
      (url->suffix_name != (Str)0) &&
      (url->anchor_query_prefix != (Str)0) &&
      (url->anchor_query != (Str)0);
    if (!complete) {
	return 0;	/* Plant breakpoint here and print "*url" */
    }
    return 1;
}

/*
 * url_is_local(url, config)
 *	This routine will return non-zero if the host name of {url}
 *	names the local machine as specified in {config} and 0
 *	otherwise.  A null host name in {url} is never local.
 */
int
url_is_local(
    Url url,
    Config config)
{
    Str wanted;
    Str local;
    unsigned length;
    Chr next;

    wanted = url->host_name;
    if (wanted == (Str)0) {
	return 0;
    }

    /*
     * {config} should hold the fully qualified DNS host name of the
     * local machine, whereas {url} may only hold a leading portion of
     * it.  Hence {wanted} matches when it is no longer than {local},
     * stops exactly at a '.' or at the end of {local}, and compares
     * equal to that leading portion of {local}.
     */
    length = str_size(wanted);
    local = config_host_name(config);
    if (length > str_size(local)) {
	return 0;
    }
    next = local[length];
    if ((next == (Chr)'.') || (next == (Chr)'\0')) {
	return str_case_equal_max(wanted, local, length);
    }
    return 0;
}

/*
 * url_make_absolute(url, base_url)
 *	This routine will return a newly allocated absolute URL for
 *	{url} using {base_url} as the base.  Neither argument is
 *	modified.  If {url} already has a protocol name, a plain copy
 *	of it is returned.  Otherwise:
 *
 *	- the protocol always comes from {base_url};
 *	- the host and port come from {base_url} unless {url} supplies
 *	  its own "//host";
 *	- the path pieces come from {base_url} only when {url} supplies
 *	  neither a host nor a leading slash; in that case the path of
 *	  {url} is appended to the directory of {base_url};
 *	- the document, suffix and anchor/query always come from {url}.
 */
Url
url_make_absolute(
    Url url,
    Url base_url)
{
    Url authority;	/* Supplier of the host and port fields */
    Url location;	/* Supplier of the leading path fields */
    Url result;
    int inherit_host;
    int inherit_path;

    if (!str_is_empty(url->protocol_name)) {
	/* Already absolute: */
	return url_copy(url);
    }

    /*
     * Relative URL.  A URL that names its own host does not inherit
     * the base path, and one that starts with a slash replaces the
     * base path outright.
     */
    inherit_host = str_is_empty(url->host_slash_slash);
    inherit_path = inherit_host && str_is_empty(url->path_head_slash);
    authority = inherit_host ? base_url : url;
    location = inherit_path ? base_url : url;

    /*
     * Build the result field by field instead of overwriting a copy
     * of {url}; that way no copied string is ever dropped unfreed.
     */
    result = url_create();
    result->protocol_name = str_copy(base_url->protocol_name);
    result->protocol_colon = str_copy(base_url->protocol_colon);
    result->host_slash_slash = str_copy(authority->host_slash_slash);
    result->host_name = str_copy(authority->host_name);
    result->port_colon = str_copy(authority->port_colon);
    result->port_number = str_copy(authority->port_number);
    result->path_head_slash = str_copy(location->path_head_slash);
    result->path_twiddle = str_copy(location->path_twiddle);
    result->path_user = str_copy(location->path_user);
    result->path_user_slash = str_copy(location->path_user_slash);

    if (!inherit_path ||
      (str_is_empty(base_url->path) &&
      str_is_empty(base_url->path_tail_slash))) {
	/* No base directory to prepend; use the path of {url} as is: */
	result->path = str_copy(url->path);
	result->path_tail_slash = str_copy(url->path_tail_slash);
    } else if (str_is_empty(url->path) &&
      str_is_empty(url->path_tail_slash)) {
	/* {url} names a document directly in the base directory: */
	result->path = str_copy(base_url->path);
	result->path_tail_slash = str_copy(base_url->path_tail_slash);
    } else {
	/*
	 * Both have a directory part; join them with the slash that
	 * ended the base directory.  The slash that ends the joined
	 * path is the one that ended the path of {url}.
	 */
	result->path = str_printf("%s%s%s",
	  base_url->path, base_url->path_tail_slash, url->path);
	result->path_tail_slash = str_copy(url->path_tail_slash);
    }

    result->document = str_copy(url->document);
    result->suffix_period = str_copy(url->suffix_period);
    result->suffix_name = str_copy(url->suffix_name);
    result->anchor_query_prefix = str_copy(url->anchor_query_prefix);
    result->anchor_query = str_copy(url->anchor_query);
    return result;
}

/*
 * url_original(url, config)
 *	This routine will return a new {Url} that can be used to access
 *	the original unannotated version of {url}.  The result is a
 *	copy of {url} whose path has been prefixed with "ORIGINAL/";
 *	{url} itself is not modified.  {config} is currently not
 *	consulted.
 */
Url
url_original(
    Url url,
    Config config)
{
    Url original = url_copy(url);
    Str prefixed_path = str_printf("ORIGINAL/%s", original->path);

    /* Swap the copied path for the prefixed one: */
    str_free(original->path);
    original->path = prefixed_path;
    return original;
}

/*
 * url_parse(str)
 *	This routine will parse {str} into a newly allocated {Url}
 *	object and return it.  Every field of the result is filled in;
 *	a component that is missing from {str} is stored as an empty
 *	string.  All components are copied out of {str}, but {str} must
 *	be writable: when it contains an anchor or query, the '#' or '?'
 *	is temporarily overwritten with a null character and restored
 *	before returning.  No code path in this routine returns (Url)0.
 */
Url
url_parse(
    Str str)
{
    Str colon;
    Str colon_or_slash;
    Str period;
    Str sharp_or_question;
    Str slash;
    Url url;

    url = url_create();

    /* Parse any protocol name: */
    colon = str_span(str, PROTOCOL_CHARACTERS);
    if (colon[0] == (Chr)':') {
	/* Have protocol: */
	url->protocol_name = str_copy_max(str, colon - str);
	url->protocol_colon = str_copy_max(colon, 1);
	str = colon + 1;
    } else {
	/* No protocol: */
	url->protocol_name = str_create_empty();
	url->protocol_colon = str_create_empty();
    }

    /* Parse any host name: */
    if (str_equal_max(str, (Str)"//", 2) ||
      str_equal_max(str, (Str)"\\\\", 2)) {
	/* Have host name: */
	url->host_slash_slash = str_copy_max(str, 2);
	str += 2;
	colon_or_slash = str_span(str, HOST_NAME_CHARACTERS);
	url->host_name = str_copy_max(str, colon_or_slash - str);
	str = colon_or_slash;
	if (colon_or_slash[0] == (Chr)':') {
	    /*
	     * Have port.  The digits start right after the colon that
	     * ended the host name; step over that colon before
	     * scanning, and leave {str} at the first character that
	     * follows the port so the path parse starts there.
	     */
	    url->port_colon = str_copy_max(colon_or_slash, 1);
	    str = colon_or_slash + 1;
	    slash = str_span(str, PORT_CHARACTERS);
	    url->port_number = str_copy_max(str, slash - str);
	    str = slash;
	} else {
	    /* No port: */
	    url->port_colon = str_create_empty();
	    url->port_number = str_create_empty();
	}
    } else {
	/* No host name: */
	url->host_slash_slash = str_create_empty();
	url->host_name = str_create_empty();
	url->port_colon = str_create_empty();
	url->port_number = str_create_empty();
    }

    /* Parse any anchor or query: */
    sharp_or_question = str_break(str, (Str)"#?");
    if (sharp_or_question == (Str)0) {
	/* No anchor or query: */
	url->anchor_query_prefix = str_create_empty();
	url->anchor_query = str_create_empty();
    } else {
	/* Have an anchor or query: */
	url->anchor_query_prefix = str_copy_max(sharp_or_question, 1);
	url->anchor_query = str_copy(sharp_or_question + 1);

	/* Temporarily truncate source string; restore before returning: */
	sharp_or_question[0] = '\0';
    }

    /* Parse the path: */
    if ((str[0] == (Chr)'/') || (str[0] == (Chr)'\\')) {
	/* Have absolute path: */
	url->path_head_slash = str_copy_max(str, 1);
	str += 1;
    } else {
	/* Relative path: */
	url->path_head_slash = str_create_empty();
    }
    if (str[0] == (Chr)'~') {
	/* Have User name: */
	url->path_twiddle = str_copy_max(str, 1);
	str += 1;
	slash = str_break(str, (Str)"/\\");
	if (slash == (Str)0) {
	    /* No trailing slash: */
	    url->path_user = str_copy(str);
	    url->path_user_slash = str_create_empty();
	    str += str_size(str);
	} else {
	    /* Trailing slash: */
	    url->path_user = str_copy_max(str, slash - str);
	    url->path_user_slash = str_copy_max(slash, 1);
	    str = slash + 1;
	}
    } else {
	/* No user name: */
	url->path_twiddle = str_create_empty();
	url->path_user = str_create_empty();
	url->path_user_slash = str_create_empty();
    }

    /* Parse rest of path up to document start: */
    slash = str_chr_reverse_search(str, (Chr)'/');	/* Try Unix first */
    if (slash == (Str)0) {
	/* Try a MS DOS next: */
	slash = str_chr_reverse_search(str, (Chr)'\\');
    }
    if ((slash != (Str)0) && (slash < str)) {
	/* Found slash that has already been accounted for: */
	slash = (Str)0;		/* Make believe none was found. */
    }
    if (slash == (Str)0) {
	/* Empty path: */
	url->path = str_create_empty();
	url->path_tail_slash = str_create_empty();
    } else {
	/* Non-empty path: */
	url->path = str_copy_max(str, slash - str);
	url->path_tail_slash = str_copy_max(slash, 1);
	str = slash + 1;
    }

    /* Parse the document and suffix (split at the last period): */
    period = str_chr_reverse_search(str, (Chr)'.');
    if (period == (Str)0) {
	/* No suffix: */
	url->document = str_copy(str);
	url->suffix_period = str_create_empty();
	url->suffix_name = str_create_empty();
    } else {
	/* Have suffix: */
	url->document = str_copy_max(str, period - str);
	url->suffix_period = str_copy_max(period, 1);
	url->suffix_name = str_copy(period + 1);
    }

    /* Restore any temporarily truncated source string: */
    if (sharp_or_question != (Str)0) {
	sharp_or_question[0] = url->anchor_query_prefix[0];
    }

    assert(url_is_valid(url));
    return url;
}

/*
 * url_port(url)
 *	This routine will return the port number associated with {url}.
 *	0 is returned when {url} has no port number, when the port
 *	text does not start with a number, or when the number is larger
 *	than 65535 (the largest TCP/UDP port).
 */
unsigned
url_port(
    Url url)
{
    enum { PORT_MAXIMUM = 65535 };
    char *digits;
    char *end;
    unsigned long value;

    if (str_is_empty(url->port_number)) {
	return 0;
    }

    /*
     * {strtoul} is used rather than {atoi} because the behavior of
     * {atoi} is undefined when the value does not fit in an {int};
     * {strtoul} saturates instead, which the range check catches.
     */
    digits = (char *)url->port_number;
    value = strtoul(digits, &end, 10);
    if ((end == digits) || (value > (unsigned long)PORT_MAXIMUM)) {
	return 0;
    }
    return (unsigned)value;
}

/*
 * url_restore(in_file)
 *	This routine will read a {Url} object from {in_file} that was
 *	written by {str_save()}.
 */
Url
url_restore(
    FILE *in_file)
{
    Str str;
    Url url;

    str = str_restore(in_file);
    if (str == (Str)0) {
	return (Url)0;
    }
    url = url_parse(str);
    return url;
}

/*
 * url_save(url, out_file)
 *	This routine will write {url} to {out_file} such that it can
 *	be restored using {url_restore}.  A null {url} is written as
 *	the single character 0; any other {url} is written as its text
 *	(see {url_write}) enclosed in double quotes.
 */
void
url_save(
    Url url,
    FILE *out_file)
{
    if (url == (Url)0) {
	/* Null URL: */
	(void)fputs("0", out_file);
	return;
    }

    /* Quoted URL text: */
    (void)fputs("\"", out_file);
    url_write(url, out_file);
    (void)fputs("\"", out_file);
}

/*
 * url_str_full_path(url)
 *	This routine will allocate and return a string containing the
 *	full path portion of {url}: everything from the leading slash
 *	through the suffix name.  The protocol, host, port and any
 *	anchor or query are left out.
 */
Str
url_str_full_path(
    Url url)
{
    return str_printf("%s%s%s%s%s%s%s%s%s",
      /* Leading slash and optional "~user/": */
      url->path_head_slash, url->path_twiddle,
      url->path_user, url->path_user_slash,
      /* Directory part: */
      url->path, url->path_tail_slash,
      /* Document and suffix: */
      url->document, url->suffix_period, url->suffix_name);
}

/*
 * url_str_local_path(url, config)
 *	This routine will allocate and return the local file path
 *	associated with {url} using {config}:
 *
 *	- without a "~user" component the path is placed under the
 *	  HTTP root obtained from {config};
 *	- with a "~user" component the path is placed under the
 *	  "public_html" directory of that user's home directory as
 *	  reported by {getpwnam}, or under "/home/<user>/public_html"
 *	  when {getpwnam} does not know the user.
 */
Str
url_str_local_path(
    Url url,
    Config config)
{
    struct passwd *password;

    /* This code won't work on MS DOS: */

    if (str_is_empty(url->path_twiddle)) {
	/* No user name; path is relative to the HTTP root: */
	return str_printf("%s/%s/%s%s%s",
	  config_http_root(config),
	  url->path,
	  url->document,
	  url->suffix_period,
	  url->suffix_name);
    }

    /* User path: */
    password = getpwnam((char *)url->path_user);
    if (password == (struct passwd *)0) {
	/*
	 * Unknown user; guess at a home directory.  This is pretty
	 * dubious, but it won't hurt anything.  The layout after
	 * "public_html" must match the known-user case below.
	 */
	return str_printf("/home/%s/public_html/%s/%s%s%s",
	  url->path_user,
	  url->path,
	  url->document,
	  url->suffix_period,
	  url->suffix_name);
    }
    return str_printf("%s/public_html/%s/%s%s%s",
      password->pw_dir,
      url->path,
      url->document,
      url->suffix_period,
      url->suffix_name);
}

/*
 * url_str(url)
 *	This routine will return a newly allocated string that holds
 *	the text of {url}, i.e. every field of {url} concatenated in
 *	declaration order.
 */
Str
url_str(
    Url url)
{
    return str_printf("%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
      /* Protocol: */
      url->protocol_name, url->protocol_colon,
      /* Host and port: */
      url->host_slash_slash, url->host_name,
      url->port_colon, url->port_number,
      /* Path: */
      url->path_head_slash, url->path_twiddle,
      url->path_user, url->path_user_slash,
      url->path, url->path_tail_slash,
      /* Document and suffix: */
      url->document, url->suffix_period, url->suffix_name,
      /* Anchor or query: */
      url->anchor_query_prefix, url->anchor_query);
}

/*
 * url_write(url, out_file)
 *	This routine will write the text of {url} to {out_file}, i.e.
 *	every field of {url} concatenated in declaration order.  Any
 *	output error is ignored.
 */
void
url_write(
    Url url,
    FILE *out_file)
{
    (void)fprintf(out_file,
      "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
      /* Protocol: */
      url->protocol_name, url->protocol_colon,
      /* Host and port: */
      url->host_slash_slash, url->host_name,
      url->port_colon, url->port_number,
      /* Path: */
      url->path_head_slash, url->path_twiddle,
      url->path_user, url->path_user_slash,
      url->path, url->path_tail_slash,
      /* Document and suffix: */
      url->document, url->suffix_period, url->suffix_name,
      /* Anchor or query: */
      url->anchor_query_prefix, url->anchor_query);
}

