jlog/scripts/url_syntax.php

<?php

function check_url ($url2check, $types) {

# Be paranoid about using grouping!

$nz_digit       =  '[1-9]';
$nz_digits      =  "(?:$nz_digit\\d*)";
$digits         =  '(?:\d+)';
$space          =  '(?:%20)';
$nl             =  '(?:%0[Aa])';
$dot            =  '\.';
$plus           =  '\+';
$qm             =  '\?';
$ast            =  '\*';
$hex            =  '[a-fA-F\d]';
$alpha          =  '[a-zA-Z]';     # No, no locale.
$alphas         =  "(?:${alpha}+)";
$alphanum       =  '[a-zA-Z\d]';   # Letter or digit.
$xalphanum      =  "(?:${alphanum}|%(?:3\\d|[46]$hex|[57][Aa\\d]))";
                      # Letter or digit, or hex escaped letter/digit.
$alphanums      =  "(?:${alphanum}+)";
$escape         =  "(?:%$hex\{2})";
$safe           =  '[$\-_.+]';
$extra          =  "[\!*'(),]";
$national       =  '[{}|\\^~[\]`]';
$punctuation    =  '[<>#%"]';
$reserved       =  '[;/?:@&=]';
$uchar          =  "(?:${alphanum}|${safe}|${extra}|${escape})";
$xchar          =  "(?:${alphanum}|${safe}|${extra}|${reserved}|${escape})";
$uchar          =  str_replace (']|[', '', $uchar);  // Make string smaller, and speed up regex.
$uchar          =  str_replace (']|[', '', $xchar);  // Make string smaller, and speed up regex.

# URL schemeparts for ip based protocols:
$user           =  "(?:(?:${uchar}|[;?&=])*)";
$password       =  "(?:(?:${uchar}|[;?&=])*)";
$hostnumber     =  "(?:${digits}(?:${dot}${digits}){3})";
$toplabel       =  "(?:${alpha}(?:(?:${alphanum}|-)*${alphanum})?)";
$domainlabel    =  "(?:${alphanum}(?:(?:${alphanum}|-)*${alphanum})?)";
$hostname       =  "(?:(?:${domainlabel}${dot})*${toplabel})";
$host           =  "(?:${hostname}|${hostnumber})";
$hostport       =  "(?:${host}(?::${digits})?)";
$login          =  "(?:(?:${user}(?::${password})?\@)?${hostport})";

# The predefined schemes:

# FTP (see also RFC959)
$fsegment       =  "(?:(?:${uchar}|[?:\@&=])*)";
$fpath          =  "(?:${fsegment}(?:/${fsegment})*)";
$ftpurl         =  "(?:ftp://${login}(?:/${fpath}(?:;type=[AIDaid])?)?)";

# FILE
$fileurl        =  "(?:file://(?:${host}|localhost)?/${fpath})";

# HTTP
$hsegment       =  "(?:(?:${uchar}|[~;:\@&=])*)";
$search         =  "(?:(?:${uchar}|[;:\@&=])*)";
$hpath          =  "(?:${hsegment}(?:/${hsegment})*)";
$httpurl        =  "(?:https?://${hostport}(?:/${hpath}(?:${qm}${search})?)?)";

# GOPHER (see also RFC1436)
$gopher_plus    =  "(?:${xchar}*)";
$selector       =  "(?:${xchar}*)";
$gtype          =      $xchar;     // Omitted parens!
$gopherurl      =  "(?:gopher://${hostport}(?:/${gtype}(?:${selector}" .
                      "(?:%09${search}(?:%09${gopher_plus})?)?)?)?)";

# MAILTO (see also RFC822)
$encoded822addr =  "(?:$xchar+)";
$mailtourl      =  "(?:mailto:$encoded822addr)";
$mailtonpurl    =  $encoded822addr;

# NEWS (see also RFC1036)
$article        =  "(?:(?:${uchar}|[;/?:&=])+\@${host})";
$group          =  "(?:${alpha}(?:${alphanum}|[_.+-])*)";
$grouppart      =  "(?:${article}|${group}|${ast})";
$newsurl        =  "(?:news:${grouppart})";

# NNTP (see also RFC977)
$nntpurl        =  "(?:nntp://${hostport}/${group}(?:/${digits})?)";

# TELNET
$telneturl      =  "(?:telnet://${login}/?)";

# WAIS (see also RFC1625)
$wpath          =  "(?:${uchar}*)";
$wtype          =  "(?:${uchar}*)";
$database       =  "(?:${uchar}*)";
$waisdoc        =  "(?:wais://${hostport}/${database}/${wtype}/${wpath})";
$waisindex      =  "(?:wais://${hostport}/${database}${qm}${search})";
$waisdatabase   =  "(?:wais://${hostport}/${database})";
# $waisurl        =  "(?:${waisdatabase}|${waisindex}|${waisdoc})";
# Speed up: the 3 types share a common prefix.
$waisurl        =  "(?:wais://${hostport}/${database}" .
                         "(?:(?:/${wtype}/${wpath})|${qm}${search})?)";

# PROSPERO
$fieldvalue     =  "(?:(?:${uchar}|[?:\@&])*)";
$fieldname      =  "(?:(?:${uchar}|[?:\@&])*)";
$fieldspec      =  "(?:;${fieldname}=${fieldvalue})";
$psegment       =  "(?:(?:${uchar}|[?:\@&=])*)";
$ppath          =  "(?:${psegment}(?:/${psegment})*)";
$prosperourl    =  "(?:prospero://${hostport}/${ppath}(?:${fieldspec})*)";

# LDAP (see also RFC1959)
# First. import stuff from RFC 1779 (Distinguished Names).
# We've modified things a bit.
$dn_separator        =  "(?:[;,])";
$dn_optional_space   =  "(?:${nl}?${space}*)";
$dn_spaced_separator =  "(?:${dn_optional_space}${dn_separator}" .
                              "${dn_optional_space})";
$dn_oid              =  "(?:${digits}(?:${dot}${digits})*)";
$dn_keychar          =  "(?:${xalphanum}|${space})";
$dn_key              =  "(?:${dn_keychar}+|(?:OID|oid)${dot}${dn_oid})";
$dn_string           =  "(?:${uchar}*)";
$dn_attribute        =  "(?:(?:${dn_key}${dn_optional_space}=" .
                                          "${dn_optional_space})?${dn_string})";
$dn_name_component   =  "(?:${dn_attribute}(?:${dn_optional_space}" .
                              "${plus}${dn_optional_space}${dn_attribute})*)";
$dn_name             =  "(?:${dn_name_component}" .
                            "(?:${dn_spaced_separator}${dn_name_component})*" .
                               "${dn_spaced_separator}?)";

# RFC 1558 defines the filter syntax, but that requires a PDA to recognize.
# Since that's too powerful for Perl's REs, we allow any char between the
# parenthesis (which have to be there.)
$ldap_filter         =  "(?:\(${xchar}+\))";

# This is from RFC 1777. It defines an attributetype as an 'OCTET STRING',
# whatever that is.
$ldap_attr_type      =  "(?:${uchar}+)";  # I'm just guessing here.
                                             # The RFCs aren't clear.

# Now we are at the grammar of RFC 1959.
$ldap_attr_list =  "(?:${ldap_attr_type}(?:,${ldap_attr_type})*)";
$ldap_attrs     =  "(?:${ldap_attr_list}?)";

$ldap_scope     =  "(?:base|one|sub)";
$ldapurl        =  "(?:ldap://(?:${hostport})?/${dn_name}" .
                        "(?:${qm}${ldap_attrs}" .
                        "(?:${qm}${ldap_scope}(?:${qm}${ldap_filter})?)?)?)";


# RFC 2056 defines the format of URLs for the Z39.50 protocol.
$z_database     =  "(?:${uchar}+)";
$z_docid        =  "(?:${uchar}+)";
$z_elementset   =  "(?:${uchar}+)";
$z_recordsyntax =  "(?:${uchar}+)";
$z_scheme       =  "(?:z39${dot}50[rs])";
$z39_50url      =  "(?:${z_scheme}://${hostport}" .
                           "(?:/(?:${z_database}(?:${plus}${z_database})*" .
                                   "(?:${qm}${z_docid})?)?" .
                               "(?:;esn=${z_elementset})?" .
                               "(?:;rs=${z_recordsyntax}" .
                                    "(?:${plus}${z_recordsyntax})*)?))";


# RFC 2111 defines the format for cid/mid URLs.
$url_addr_spec  =  "(?:(?:${uchar}|[;?:@&=])*)";
$message_id     =  $url_addr_spec;
$content_id     =  $url_addr_spec;
$cidurl         =  "(?:cid:${content_id})";
$midurl         =  "(?:mid:${message_id}(?:/${content_id})?)";


# RFC 2122 defines the Vemmi URLs.
$vemmi_attr     =  "(?:(?:${uchar}|[/?:@&])*)";
$vemmi_value    =  "(?:(?:${uchar}|[/?:@&])*)";
$vemmi_service  =  "(?:(?:${uchar}|[/?:@&=])*)";
$vemmi_param    =  "(?:;${vemmi_attr}=${vemmi_value})";
$vemmiurl       =  "(?:vemmi://${hostport}" .
                          "(?:/${vemmi_service}(?:${vemmi_param}*))?)";

# RFC 2192 for IMAP URLs.
# Import from RFC 2060.
# $imap4_astring       =  "";
# $imap4_search_key    =  "";
# $imap4_section_text  =  "";
$imap4_nz_number     =  $nz_digits;
$achar          =  "(?:${uchar}|[&=~])";
$bchar          =  "(?:${uchar}|[&=~:\@/])";
$enc_auth_type  =  "(?:${achar}+)";
$enc_list_mbox  =  "(?:${bchar}+)";
$enc_mailbox    =  "(?:${bchar}+)";
$enc_search     =  "(?:${bchar}+)";
$enc_section    =  "(?:${bchar}+)";
$enc_user       =  "(?:${achar}+)";
$i_auth         =  "(?:;[Aa][Uu][Tt][Hh]=(?:${ast}|${enc_auth_type}))";
$i_list_type    =  "(?:[Ll](?:[Ii][Ss][Tt]|[Ss][Uu][Bb]))";
$i_mailboxlist  =  "(?:${enc_list_mbox}?;[Tt][Yy][Pp][Ee]=${i_list_type})";
$i_uidvalidity  =  "(?:;[Uu][Ii][Dd][Vv][Aa][Ll][Ii][Dd][Ii][Tt][Yy]=" .
                          "${imap4_nz_number})";
$i_messagelist  =  "(?:${enc_mailbox}(?:${qm}${enc_search})?" .
                                       "(?:${i_uidvalidity})?)";
$i_section      =  "(?:/;[Ss][Ee][Cc][Tt][Ii][Oo][Nn]=${enc_section})";
$i_uid          =  "(?:/;[Uu][Ii][Dd]=${imap4_nz_number})";
$i_messagepart  =  "(?:${enc_mailbox}(?:${i_uidvalidity})?${i_uid}" .
                                       "(?:${i_section})?)";
$i_command      =  "(?:${i_mailboxlist}|${i_messagelist}|${i_messagepart})";
$i_userauth     =  "(?:(?:${enc_user}(?:${i_auth})?)|" .
                         "(?:${i_auth}(?:${enc_user})?))";
$i_server       =  "(?:(?:${i_userauth}\@)?${hostport})";
$imapurl        =  "(?:imap://${i_server}/(?:$i_command)?)";

# RFC 2224 for NFS.
$nfs_mark       =  '[\$\-_.\!~*\'(),]';
$nfs_unreserved =  "(?:${alphanum}|${nfs_mark})";
$nfs_unreserved =  str_replace (']|[', '', $nfs_unreserved);  // Make string smaller, and speed up regex.
$nfs_pchar      =  "(?:${nfs_unreserved}|${escape}|[:\@&=+])";
$nfs_segment    =  "(?:${nfs_pchar}*)";
$nfs_path_segs  =  "(?:${nfs_segment}(?:/${nfs_segment})*)";
$nfs_url_path   =  "(?:/?${nfs_path_segs})";
$nfs_rel_path   =  "(?:${nfs_path_segs}?)";
$nfs_abs_path   =  "(?:/${nfs_rel_path})";
$nfs_net_path   =  "(?://${hostport}(?:${nfs_abs_path})?)";
$nfs_rel_url    =  "(?:${nfs_net_path}|${nfs_abs_path}|${nfs_rel_path})";
$nfsurl         =  "(?:nfs:${nfs_rel_url})";

$valid_types = array (
  'http', 'ftp', 'news', 'nntp', 'telnet', 'gopher', 'wais', 'mailto',
  'mailtonp', 'file', 'prospero', 'ldap', 'z39_50', 'cid', 'mid', 'vemmi',
  'imap', 'nfs'
);

# Combining all the different URL formats into a single regex.

$valid = false;

if (!is_array ($types)) {
    $types = array ($types);
}

foreach ($types as $type) {
    if (!in_array ($type, $valid_types)) {
        continue;
    }
    $re = $type.'url';
    if (preg_match ('!^'.$$re.'$!i', $url2check)) {
        $valid = $type;
        break;
    }
}

return $valid;

}

?>