// Sample HTML Document parsing - extraction of URLs
//
// Functions and data structures to support the extraction of
// URLs from an HTML document (from the hyperlinks found in the document).
//
// Also support for parsing URLs into protocol, hostname, port and uri
// components.
//
// This code uses the Unix (POSIX) regex library to do most of the work.
//
//
// NOTE: There is a sample main() included at the bottom of this file
// (it is commented out).
// The sample main runs the parsing code on a file (the name of the file is
// taken from the command line). You need to uncomment the main and compile
// this file to test out the main program.
//
// Questions should be sent to netprog@cs.rpi.edu.

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <regex.h>
// fcntl.h / unistd.h are only needed by the commented-out sample main
// (open / read).
#include <fcntl.h>
#include <unistd.h>

// data structure that is used to hold the components of a URL.
typedef struct {
  char *protocol;   // e.g. "http"; malloc'd
  char *host;       // host name; malloc'd, may be NULL when no host is known
  int port;         // port number (80 when the URL did not specify one)
  char *uri;        // path part of the URL; malloc'd
} urlrec;

// number of urlrecs currently allocated by parse_url (debugging aid,
// printed by report()).
int urlrecs = 0;

// Return a malloc'd copy of s, or NULL if s is NULL or memory runs out.
// Used instead of strdup so the file also builds as strict ISO C.
static char *copy_string(const char *s) {
  size_t n;
  char *copy;

  if (s == NULL) {
    return NULL;
  }
  n = strlen(s) + 1;
  copy = malloc(n);
  if (copy != NULL) {
    memcpy(copy, s, n);
  }
  return copy;
}

// cleanup function - will call free on all parts of a urlrec and on the
// urlrec itself, and decrements the urlrecs counter.
// Passing NULL is allowed and does nothing.
void urlrec_cleanup(urlrec *u) {
  if (u == NULL) {
    return;
  }
  free(u->protocol);   // free(NULL) is a no-op, no guards needed
  free(u->host);
  free(u->uri);
  free(u);
  urlrecs--;
}

// convert from urlrec to string (max 1000 bytes including the terminator;
// longer results are silently truncated).
//
// http URLs with a host are rendered as "http://host[:port]uri" (the port
// is omitted when it is 80). Everything else is rendered as
// "protocol:uri\n" (or "http:uri\n" when there is no protocol) - note the
// trailing newline in those two forms.
//
// Returns a malloc'd string that the caller must free, or NULL if memory
// could not be allocated.
char *urlrec_to_string(urlrec u) {
  char tmp[1000];
  // never hand a NULL pointer to %s
  const char *uri = (u.uri != NULL) ? u.uri : "";

  if (u.protocol != NULL && u.host != NULL && strcmp(u.protocol, "http") == 0) {
    if (u.port == 80) {
      snprintf(tmp, sizeof tmp, "http://%s%s", u.host, uri);
    } else {
      snprintf(tmp, sizeof tmp, "http://%s:%d%s", u.host, u.port, uri);
    }
  } else if (u.protocol != NULL) {
    snprintf(tmp, sizeof tmp, "%s:%s\n", u.protocol, uri);
  } else {
    snprintf(tmp, sizeof tmp, "http:%s\n", uri);
  }
  return copy_string(tmp);
}

// extract_match is used after a call to regexec finds a match.
// this function just grabs the substring matched by part of
// the regular expression (as reported by regexec).
//
// extract the regex match specified by a regmatch record from
// the string str.
// Returns NULL if there was no match (or if memory could not be allocated),
// otherwise returns a malloc'd copy of the part of the string that matched.
char *extract_match(regmatch_t pm, char *str) {
  size_t len;
  char *tmp;

  // check for a match
  if (str == NULL || pm.rm_so == -1) {
    return NULL;   // there was no match
  }

  // get the length of the match
  len = (size_t)(pm.rm_eo - pm.rm_so);
  tmp = malloc(len + 1);
  if (tmp == NULL) {
    return NULL;
  }
  memcpy(tmp, str + pm.rm_so, len);
  tmp[len] = 0;
  return tmp;
}

// parse_url rips apart a URL string and builds a url struct
//
// if the URL is a relative url (no protocol or host specified)
// the protocol is set to "http" and the host and port are set to match
// the base url; a relative url with no base is rejected.
//
// if the URL has a protocol but no host, the host and port are taken from
// base when base is given; otherwise host is left NULL and port is 0.
//
// if there is a host but no port specified, the default of 80 is used.
//
// This uses the regex library - uses a regular expression
// to isolate each of the parts of a url.
//
// Returns a malloc'd urlrec (release it with urlrec_cleanup), or NULL if
// url is NULL, does not match, is relative without a base, has a port that
// does not fit in an int, or memory runs out.
urlrec *parse_url(char *url, urlrec *base) {
  // define the url extraction regular expression
  const char *urlparse_regex =
      "(([a-z]+):(//([^:/]+)(:([0-9]+))?)?)?(/?[^ \t\r\n]*)";

  // Some constants that define which subexpression matches the
  // various parts of the string
#define R_PROTO 2
#define R_HOST 4
#define R_PORT 6
#define R_URI 7

  // static variables used to avoid recompiling the regular expression
  // every time.
  static regex_t r;
  static int firsttime = 1;

  regmatch_t pm[10];
  char *tmp;
  urlrec *u;

  if (url == NULL) {
    return NULL;
  }

  // compile the regular expression if this is the first time called
  if (firsttime) {
    if (regcomp(&r, urlparse_regex, REG_EXTENDED | REG_ICASE)) {
      fprintf(stderr, "Error compiling regular expression for parse_url\n");
      exit(1);
    }
    firsttime = 0;
  }

  // call regexec to match the regular expression to the string url
  if (regexec(&r, url, 10, pm, REG_NOTBOL) != 0) {
    // no match found - just return null
    return NULL;
  }

  // create a url structure to hold the result.
  // NOTE: the caller is responsible for freeing this structure!
  // (use urlrec_cleanup)
  u = malloc(sizeof *u);
  if (u == NULL) {
    return NULL;
  }
  u->protocol = u->host = u->uri = NULL;
  u->port = 0;
  urlrecs++;

  if (pm[R_PROTO].rm_so != -1) {
    // a protocol was specified, so this must be an absolute URL
    u->protocol = extract_match(pm[R_PROTO], url);
    if (u->protocol == NULL) {
      goto fail;
    }

    if (pm[R_HOST].rm_so != -1) {
      u->host = extract_match(pm[R_HOST], url);
      if (u->host == NULL) {
        goto fail;
      }
      u->port = 80;   // no port - default is 80
      if (pm[R_PORT].rm_so != -1) {
        long port;

        tmp = extract_match(pm[R_PORT], url);
        if (tmp == NULL) {
          goto fail;
        }
        // the regex guarantees digits only, so only overflow can go wrong
        errno = 0;
        port = strtol(tmp, NULL, 10);
        free(tmp);
        if (errno == ERANGE || port > INT_MAX) {
          goto fail;
        }
        u->port = (int)port;   // found a port number
      }
    } else if (base != NULL) {
      // no host in the url - borrow host and port from the base
      u->host = copy_string(base->host);
      if (base->host != NULL && u->host == NULL) {
        goto fail;
      }
      u->port = base->port;
    }
    // else: no host and no base - host stays NULL, port stays 0

    u->uri = extract_match(pm[R_URI], url);   // grab the URI
    if (u->uri == NULL) {
      goto fail;
    }
    return u;
  }

  // no protocol - must be a relative URL.
  // use the base host and port and set protocol to http
  if (base == NULL) {
    // invalid hyperlink - just return null
    printf("INVALID LINK\n");
    goto fail;
  }

  u->protocol = copy_string("http");
  if (u->protocol == NULL) {
    goto fail;
  }
  u->host = copy_string(base->host);
  if (base->host != NULL && u->host == NULL) {
    goto fail;
  }
  u->port = base->port;

  // get the uri part
  tmp = extract_match(pm[R_URI], url);
  if (tmp == NULL) {
    goto fail;
  }

  if (*tmp == '/') {
    // leading slash - don't add base uri; u takes ownership of tmp
    u->uri = tmp;
  } else {
    // relative uri - need to prepend the base uri
    // after taking off the last component of the base uri
    // (if it doesn't end with /)
    // also need to watch for '#' links!
    const char *base_uri = (base->uri != NULL) ? base->uri : "";
    size_t base_len = strlen(base_uri);
    char *slash;

    // +2: the terminator, plus room for a '/' when the base uri has none
    u->uri = malloc(base_len + strlen(tmp) + 2);
    if (u->uri == NULL) {
      free(tmp);
      goto fail;
    }
    memcpy(u->uri, base_uri, base_len + 1);

    if (tmp[0] != '#') {
      // keep everything up to and including the last '/';
      // a base uri without any '/' collapses to "/"
      slash = strrchr(u->uri, '/');
      if (slash != NULL) {
        slash[1] = 0;
      } else {
        u->uri[0] = '/';
        u->uri[1] = 0;
      }
    }
    strcat(u->uri, tmp);
    free(tmp);
  }
  return u;

fail:
  // releases whatever was allocated so far and undoes the urlrecs++
  urlrec_cleanup(u);
  return NULL;
}

// Extract just the target URL from an A tag, return null if
// no valid target was found. This function assumes the input string
// is a complete A tag including the < and >.
// allocates memory for the return value, so the caller is responsible
// for freeing the buffer (use free()).
// atag   - NUL-terminated text of one A tag.
// return - malloc'd copy of the HREF value (everything after an optional
//          run of double quotes up to the next '"' or '>'), or NULL if atag
//          is NULL or contains no HREF. Exits the program if memory runs out
//          or the regular expression cannot be compiled.
char *extract_url(char *atag) {
  const char *hrefregex = "HREF[ \t\r\n]*=[ \t\r\n]*\"*([^>\"]+)";
  static int firsttime = 1;
  static regex_t reg;
  regmatch_t pm[2];
  size_t len;
  char *tmp;

  if (atag == NULL) {
    return NULL;
  }

  // compile the regular expression if this is the first time called
  if (firsttime) {
    if (regcomp(&reg, hrefregex, REG_EXTENDED | REG_ICASE)) {
      fprintf(stderr, "Error compiling regular expression for extract_url\n");
      exit(1);
    }
    firsttime = 0;
  }

  // Call regexec to find what part of the string matches the regular expression
  if (regexec(&reg, atag, 2, pm, REG_NOTBOL) != 0) {
    // no match found - just return null
    return NULL;
  }

  // we need the first subexpression, so look in pm[1]
  if (pm[1].rm_so == -1) {
    // we didn't get a match
    return NULL;
  }

  // get the length of the match
  len = (size_t)(pm[1].rm_eo - pm[1].rm_so);

  tmp = malloc(len + 1);
  if (tmp == NULL) {
    // memory error
    fprintf(stderr, "Memory allocation error\n");
    exit(1);
  }
  memcpy(tmp, atag + pm[1].rm_so, len);
  tmp[len] = 0;
  return tmp;
}

// -------------------------------------------------------------
// data structure used to return a list of hyperlinks extracted
// from an HTML document.

typedef struct linkrec {
  char *url;              // the url itself (target of the link)
  struct linkrec *next;   // pointer to next record
} linkrec;

// lnkcnt, inc, dec, report are just used for debugging (to make sure
// all allocated linkrecs and urlrecs are released with free).

int lnkcnt = 0;

// urlrecs is the urlrec counter defined next to the urlrec type earlier
// in this file; it is redeclared here because report() prints it.
extern int urlrecs;

// record that one more linkrec has been allocated
void lnkinc(void) {
  lnkcnt++;
}

// record that one linkrec has been released
void lnkdec(void) {
  lnkcnt--;
}

// print the current urlrec and linkrec counters to stdout
void report(void) {
  printf("URLS: %d\n", urlrecs);
  printf("LRECS: %d\n", lnkcnt);
}

// extract_links returns a linked list of the hyperlink targets
// (URLs) found in an HTML document. The linked list is null terminated.
//
// everything is dynamically allocated, so you need to free
// everything when you are done with it.
linkrec *extract_links(char *buf) { static int firsttime=1; const char *linkregex="<[aA][ \t\n\r][^>]*>"; static regex_t reg; regmatch_t pm; char *p; char tmp[1000]; linkrec *l,*head; int nlinks=0; // only compile the regular expression the first time we use it if (firsttime) { if ( regcomp(®,linkregex,REG_EXTENDED|REG_ICASE)) { fprintf(stderr,"Error - invalid regexp\n"); exit(1); } firsttime=0; } // initialize the links array to be of size 1. This allows // us to store the terminating null pointer (that tells the // caller when it's hit the end of the array). l=head=NULL; pm.rm_eo = pm.rm_so=0; while (regexec(®,buf,1,&pm,REG_NOTBOL)==0) { if (pm.rm_eo-pm.rm_so>1000) { fprintf(stderr,"Error - stumbled across a link to big to handle\n"); exit(1); } strncpy(tmp,buf+pm.rm_so,pm.rm_eo-pm.rm_so); tmp[pm.rm_eo-pm.rm_so]=0; buf+=pm.rm_eo; // got one A tag - extract the target p = extract_url(tmp); // if we really found a URL - add it to the list if (p!=NULL) { if (head==NULL) { head=l=malloc(sizeof(linkrec)); lnkinc(); if (!head) { fprintf(stderr,"Error - ran out of memory\n"); exit(1); } head->next=NULL; head->url=p; } else { l->next = malloc(sizeof(linkrec)); lnkinc(); if (l->next==NULL) { fprintf(stderr,"Error - ran out of memory\n"); exit(1); } l = l->next; l->next=NULL; l->url = p; } } } return(head); } /* // Sample main (used only to test the above code). // This main expects a single command line argument specifying // the name of a file containing HTML. The main reads in the file // and extracts (and prints out) all the hyperlink targets in the file. // a function that reads in a complete file and returns it // as one big long string. Not that efficient (just calls // realloc over and over...) 
// char *getfile(char *name) { int fd; char *buf; char tmp[1000]; int size=0; int n; fd = open(name,O_RDONLY); if (fd<0) { fprintf(stderr,"Error opening file\n"); exit(1); } buf = malloc(1001); while ((n=read(fd,buf+size,1000))>0) { size+=n; buf=realloc(buf,size+1001); if (buf==NULL) { printf("Memory Error\n"); exit(1); } } buf[size]=0; return(buf); } /* // Sample main that can be used to test the parsing code. int main(int argc, char **argv) { linkrec *lr,*tmp; char *p; int i; urlrec *u; urlrec *dmy; if (argc!=2) { fprintf(stderr,"Error - supply a filename\n"); exit(1); } // read in the entire file p = getfile(argv[1]); // extract hyperlinks from the file lr = extract_links(p); // use a phoney URL as the base (just to make sure the // resolution of relative URLs works) dmy = parse_url("http://dmy.com/",NULL); printf("Here are the links:\n"); while (lr) { printf("%s\n",lr->url); u = parse_url(lr->url,dmy); if (u->protocol) { printf("\tPROTOCOL %s\n",u->protocol); if (u->host) { printf("\tHOST:%s\n\tPORT: %d\n",u->host,u->port); } } printf("\tURI:%s\n",u->uri); // need to free stuff in the urlrec urlrec_cleanup(u); // and the linked list element free(lr->url); tmp=lr; lr = lr->next; free(tmp); } } */