// Sample HTML Document parsing - extraction of URLs
//
// functions and data structures to support the extraction of
// URLs from an HTML document (from the hyperlinks found in an HTML document).
//
// also support for parsing URLs into hostname, port and uri components
//
// This code uses the Unix (POSIX) regex library to do most of the work.
//
//
// NOTE: There is a sample main() included at the bottom of this file,
// (it is commented out).
// The sample main runs the parsing code on a file (the name of the file is
// taken from the command line). You need to uncomment the main and compile
// this file to test out the main program.
//
// Questions should be sent to netprog@cs.rpi.edu.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <regex.h>
#include <fcntl.h>
#include <unistd.h>
// urlrec - the individual pieces of a URL once it has been taken apart.
// The three string members are pointers that urlrec_cleanup() passes to
// free(), so they must be NULL or heap allocated.
typedef struct {
    char *protocol;   // scheme name, e.g. "http"
    char *host;       // host name part
    int port;         // port number
    char *uri;        // everything after the host/port part
} urlrec;

// Debugging counter: parse_url() adds one for every urlrec it allocates and
// urlrec_cleanup() subtracts one for every urlrec it releases.
int urlrecs = 0;
// urlrec_cleanup - release a urlrec together with every string it holds.
//
// u: a heap-allocated record whose protocol, host and uri members are each
//    either NULL or a pointer that may be passed to free(). A NULL u is
//    ignored, the same way free(NULL) is.
//
// The global urlrecs counter is decremented once for each record released.
void urlrec_cleanup(urlrec *u) {
    if (u == NULL) {
        return;   // nothing to release, and nothing to un-count
    }
    // free(NULL) is defined to do nothing, so the members need no guards
    free(u->protocol);
    free(u->host);
    free(u->uri);
    free(u);
    urlrecs--;
}
// urlrec_to_string - format a urlrec as a newly allocated string.
//
// u: the record to format (passed by value). A NULL host or uri is
//    formatted as an empty string.
//
// Output by case:
//   protocol is "http", port 80:     http://<host><uri>
//   protocol is "http", other port:  http://<host>:<port><uri>
//   any other protocol:              <protocol>:<uri> followed by a newline
//   protocol is NULL:                http:<uri> followed by a newline
//
// The text is built in a 1000-byte buffer, so a longer result is truncated
// to 999 characters. Returns whatever strdup() returns for that text: a
// string the caller releases with free(), or NULL if the copy fails.
//
// NOTE(review): only the last two forms end in '\n'; kept exactly as found -
// confirm whether callers rely on it.
// NOTE(review): the "http" test is case sensitive, while parse_url compiles
// its pattern with REG_ICASE, so a protocol of "HTTP" takes the generic form.
char *urlrec_to_string(urlrec u) {
    char tmp[1000];
    // a NULL argument for %s is undefined behavior, so substitute ""
    const char *host = u.host ? u.host : "";
    const char *uri = u.uri ? u.uri : "";

    if (u.protocol && strcmp(u.protocol, "http") == 0) {
        if (u.port == 80) {
            // default port is left out of the text
            snprintf(tmp, sizeof tmp, "http://%s%s", host, uri);
        } else {
            snprintf(tmp, sizeof tmp, "http://%s:%d%s", host, u.port, uri);
        }
    } else if (u.protocol) {
        snprintf(tmp, sizeof tmp, "%s:%s\n", u.protocol, uri);
    } else {
        snprintf(tmp, sizeof tmp, "http:%s\n", uri);
    }
    return strdup(tmp);
}
// extract_match - copy the substring described by one regmatch_t record.
//
// Intended for use after regexec() reports a match: pm is one entry of the
// match array that regexec() filled in and str is the string that was
// searched.
//
// Returns NULL when the record marks "no match" (rm_so == -1), when the
// offsets are inverted, or when str is NULL. Otherwise returns a malloc'd,
// NUL-terminated copy of str[rm_so .. rm_eo); the caller releases it with
// free(). If memory cannot be obtained a message is written to stderr and
// the program exits with status 1.
char *extract_match( regmatch_t pm, char *str) {
    size_t len;
    char *copy;

    if (str == NULL || pm.rm_so == -1 || pm.rm_eo < pm.rm_so) {
        return NULL;   // nothing usable to copy
    }

    len = (size_t)(pm.rm_eo - pm.rm_so);
    copy = malloc(len + 1);
    if (copy == NULL) {
        fprintf(stderr, "Memory allocation error\n");
        exit(1);
    }
    // the length is known, so a plain byte copy plus terminator is enough
    memcpy(copy, str + pm.rm_so, len);
    copy[len] = '\0';
    return copy;
}
// Copy s into fresh heap memory for parse_url. A NULL s gives NULL; running
// out of memory prints a message to stderr and exits with status 1.
static char *parse_url_dup(const char *s) {
    char *copy;

    if (s == NULL) {
        return NULL;
    }
    copy = malloc(strlen(s) + 1);
    if (copy == NULL) {
        fprintf(stderr, "Memory allocation error\n");
        exit(1);
    }
    strcpy(copy, s);
    return copy;
}

// Build the uri for a relative link: base_uri with its last component
// removed (unless rel is a '#' fragment link), followed by rel.
// Returns heap memory owned by the caller; exits on allocation failure.
static char *parse_url_join(const char *base_uri, const char *rel) {
    size_t len;
    char *joined;

    if (base_uri == NULL) {
        base_uri = "";
    }
    len = strlen(base_uri);
    // +2: one byte for the terminator and one for the '/' written below
    // when nothing of the base uri is left (e.g. an empty base uri)
    joined = malloc(len + strlen(rel) + 2);
    if (joined == NULL) {
        fprintf(stderr, "Memory allocation error\n");
        exit(1);
    }
    memcpy(joined, base_uri, len + 1);

    if (rel[0] != '#') {
        // walk back from the terminator, erasing characters, until the
        // last '/' (kept) or the start of the string is reached
        while (len > 0 && joined[len] != '/') {
            joined[len] = '\0';
            len--;
        }
        if (len == 0) {
            joined[0] = '/';
            joined[1] = '\0';
        }
    }
    strcat(joined, rel);
    return joined;
}

// parse_url - take a URL string apart and build a urlrec from it.
//
// url:  the URL text; NULL yields NULL.
// base: record used to resolve relative URLs; may be NULL.
//
// Absolute URL (a protocol is present):
//   - with a host: port is the number given in the URL, or 80 if none.
//   - without a host: host and port are copied from base; when base is NULL
//     host is left NULL and port is 0.
//   - uri is the remainder of the URL.
// Relative URL (no protocol):
//   - base is required; with a NULL base "INVALID LINK" is printed and NULL
//     is returned.
//   - protocol is "http", host and port come from base.
//   - a uri starting with '/' is used as is; otherwise it is appended to the
//     base uri after removing the base uri's last component (a link starting
//     with '#' is appended to the unmodified base uri).
//
// Returns NULL if the pattern does not match. Otherwise returns a malloc'd
// urlrec that the caller must release with urlrec_cleanup(); the global
// urlrecs counter is incremented for it. Allocation failure and failure to
// compile the pattern end the program with exit(1).
//
// The pattern is compiled once, on the first call, into a static regex_t.
// NOTE(review): the port text is converted with atoi, so there is no range check.
urlrec *parse_url(char *url, urlrec *base) {
    // the url extraction regular expression
    const char *urlparse_regex="(([a-z]+):(//([^:/]+)(:([0-9]+))?)?)?(/?[^ \t\r\n]*)";
    // which subexpression holds which part of the URL
#define R_PROTO 2
#define R_HOST 4
#define R_PORT 6
#define R_URI 7
    static regex_t r;
    static int firsttime=1;
    regmatch_t pm[10];
    char *piece;
    urlrec *u;

    if (url == NULL) {
        return NULL;
    }

    // compile the regular expression if this is the first time called
    if (firsttime) {
        if ( regcomp(&r,urlparse_regex,REG_EXTENDED|REG_ICASE) ) {
            fprintf(stderr,"Error compiling regular expression for parse_url\n");
            exit(1);
        }
        firsttime=0;
    }

    if (regexec(&r,url,10,pm,REG_NOTBOL)!=0) {
        return NULL;   // no match found
    }

    u = malloc(sizeof *u);
    if (u == NULL) {
        fprintf(stderr, "Memory allocation error\n");
        exit(1);
    }
    u->protocol = u->host = u->uri = NULL;
    u->port = 0;
    urlrecs++;

    u->protocol = extract_match(pm[R_PROTO],url);
    if (u->protocol != NULL) {
        // a protocol was specified, so this is an absolute URL
        u->host = extract_match(pm[R_HOST],url);
        if (u->host != NULL) {
            piece = extract_match(pm[R_PORT],url);
            if (piece != NULL) {
                u->port = atoi(piece);   // found a port number
                free(piece);             // the digits are no longer needed
            } else {
                u->port = 80;            // no port - default is 80
            }
        } else if (base != NULL) {
            u->host = parse_url_dup(base->host);
            u->port = base->port;
        }
        // with no host and no base, host stays NULL and port stays 0
        u->uri = extract_match(pm[R_URI],url);
        return u;
    }

    // no protocol - must be a relative URL, which needs a base
    if (base == NULL) {
        printf("INVALID LINK\n");
        urlrec_cleanup(u);
        return NULL;
    }
    u->protocol = parse_url_dup("http");
    u->host = parse_url_dup(base->host);
    u->port = base->port;

    piece = extract_match(pm[R_URI],url);
    if (piece == NULL) {
        // the uri group can match an empty string, so this is not expected;
        // guard anyway rather than dereference NULL
        urlrec_cleanup(u);
        return NULL;
    }
    if (piece[0] == '/') {
        // leading slash - don't add base uri; the record takes ownership
        u->uri = piece;
    } else {
        u->uri = parse_url_join(base->uri, piece);
        free(piece);   // its text was copied into u->uri
    }
    return u;
}
// extract_url - pull the HREF target out of an A tag.
//
// atag: the text of one A tag (expected to be complete, including the
//       < and >); NULL yields NULL.
//
// The tag is searched, ignoring case, for HREF, optional whitespace, '=',
// optional whitespace and any number of double quotes; the target is the
// run of characters after that, up to the next '"' or '>'.
//
// Returns NULL if no such target is found. Otherwise returns a malloc'd,
// NUL-terminated copy of the target that the caller releases with free().
// Failure to compile the pattern or to allocate memory prints a message to
// stderr and exits with status 1.
//
// The pattern is compiled once, on the first call, into a static regex_t.
// NOTE(review): an unquoted value is not ended by whitespace, so
// <a href=x.html class=y> gives "x.html class=y" - confirm that is acceptable.
char *extract_url(char *atag) {
    const char *hrefregex="HREF[ \t\r\n]*=[ \t\r\n]*\"*([^>\"]+)";
    static int firsttime=1;
    static regex_t reg;
    regmatch_t pm[2];
    size_t len;
    char *target;

    if (atag == NULL) {
        return NULL;
    }

    // compile the regular expression if this is the first time called
    if (firsttime) {
        if ( regcomp(&reg,hrefregex,REG_EXTENDED|REG_ICASE) ) {
            fprintf(stderr,"Error compiling regular expression for extract_url\n");
            exit(1);
        }
        firsttime=0;
    }

    if (regexec(&reg,atag,2,pm,REG_NOTBOL)!=0) {
        return NULL;   // no HREF in this tag
    }
    // the target is the first subexpression, reported in pm[1]
    if (pm[1].rm_so==-1) {
        return NULL;
    }

    len = (size_t)(pm[1].rm_eo - pm[1].rm_so);
    target = malloc(len+1);
    if (target == NULL) {
        fprintf(stderr,"Memory allocation error\n");
        exit(1);
    }
    memcpy(target, atag+pm[1].rm_so, len);
    target[len] = '\0';
    return target;
}
// -------------------------------------------------------------
// linkrec - one node of the list of hyperlinks extracted from an
// HTML document.
struct linkrec {
    char *url;              // target of the link
    struct linkrec *next;   // following node in the list
};
typedef struct linkrec linkrec;
// lnkcnt together with lnkinc, lnkdec and report exists purely for
// debugging: it helps confirm that every allocated linkrec and urlrec
// is eventually released with free.
int lnkcnt = 0;

// lnkinc - count one more linkrec
void lnkinc(void) {
    lnkcnt += 1;
}

// lnkdec - count one linkrec fewer
void lnkdec(void) {
    lnkcnt -= 1;
}
// report - write the current urlrecs and lnkcnt counter values to
// standard output, one per line.
void report(void) {
    fprintf(stdout, "URLS: %d\n", urlrecs);
    fprintf(stdout, "LRECS: %d\n", lnkcnt);
}
// extract_links - collect the hyperlink targets found in an HTML document.
//
// buf: the NUL-terminated document text; NULL yields NULL.
//
// The text is scanned for A tags ("<a" or "<A", one whitespace character,
// then everything up to the next '>'). Each tag is handed to extract_url();
// every tag that yields a target adds one node to the result, in document
// order. Tags of any length are handled.
//
// Returns the head of a NULL-terminated linked list, or NULL if no targets
// were found. Every node and every url string is heap allocated, so the
// caller must free both when done. lnkinc() is called once per node.
// Failure to compile the pattern or to allocate memory prints a message to
// stderr and exits with status 1.
//
// The pattern is compiled once, on the first call, into a static regex_t.
linkrec *extract_links(char *buf) {
    static int firsttime=1;
    const char *linkregex="<[aA][ \t\n\r][^>]*>";
    static regex_t reg;
    regmatch_t pm;
    linkrec *head = NULL;
    linkrec **tail = &head;   // where the next node gets linked in

    if (buf == NULL) {
        return NULL;
    }

    // only compile the regular expression the first time we use it
    if (firsttime) {
        if ( regcomp(&reg,linkregex,REG_EXTENDED|REG_ICASE)) {
            fprintf(stderr,"Error - invalid regexp\n");
            exit(1);
        }
        firsttime=0;
    }

    while (regexec(&reg,buf,1,&pm,REG_NOTBOL)==0) {
        size_t taglen = (size_t)(pm.rm_eo - pm.rm_so);
        char *tag;
        char *target;
        linkrec *node;

        // extract_url needs a NUL-terminated string, so copy the tag out;
        // a heap copy sized to the tag avoids any fixed length limit
        tag = malloc(taglen + 1);
        if (tag == NULL) {
            fprintf(stderr,"Error - ran out of memory\n");
            exit(1);
        }
        memcpy(tag, buf + pm.rm_so, taglen);
        tag[taglen] = '\0';

        // continue the next search right after this tag
        buf += pm.rm_eo;

        // got one A tag - extract the target
        target = extract_url(tag);
        free(tag);
        if (target == NULL) {
            continue;   // tag without a usable HREF
        }

        node = malloc(sizeof *node);
        if (node == NULL) {
            fprintf(stderr,"Error - ran out of memory\n");
            exit(1);
        }
        lnkinc();
        node->url = target;   // the node takes ownership of the string
        node->next = NULL;
        *tail = node;
        tail = &node->next;
    }
    return head;
}
/*
// Sample main (used only to test the above code).
// This main expects a single command line argument specifying
// the name of a file containing HTML. The main reads in the file
// and extracts (and prints out) all the hyperlink targets in the file.
// a function that reads in a complete file and returns it
// as one big long string. Not that efficient (just calls
// realloc over and over...)
//
char *getfile(char *name) {
int fd;
char *buf;
char tmp[1000];
int size=0;
int n;
fd = open(name,O_RDONLY);
if (fd<0) {
fprintf(stderr,"Error opening file\n");
exit(1);
}
buf = malloc(1001);
while ((n=read(fd,buf+size,1000))>0) {
size+=n;
buf=realloc(buf,size+1001);
if (buf==NULL) {
printf("Memory Error\n");
exit(1);
}
}
buf[size]=0;
return(buf);
}
/*
// Sample main that can be used to test the parsing code.
int main(int argc, char **argv) {
linkrec *lr,*tmp;
char *p;
int i;
urlrec *u;
urlrec *dmy;
if (argc!=2) {
fprintf(stderr,"Error - supply a filename\n");
exit(1);
}
// read in the entire file
p = getfile(argv[1]);
// extract hyperlinks from the file
lr = extract_links(p);
// use a phoney URL as the base (just to make sure the
// resolution of relative URLs works)
dmy = parse_url("http://dmy.com/",NULL);
printf("Here are the links:\n");
while (lr) {
printf("%s\n",lr->url);
u = parse_url(lr->url,dmy);
if (u->protocol) {
printf("\tPROTOCOL %s\n",u->protocol);
if (u->host) {
printf("\tHOST:%s\n\tPORT: %d\n",u->host,u->port);
}
}
printf("\tURI:%s\n",u->uri);
// need to free stuff in the urlrec
urlrec_cleanup(u);
// and the linked list element
free(lr->url);
tmp=lr;
lr = lr->next;
free(tmp);
}
}
*/