/* RTF2HTML.c, Chuck Shotton - 6/21/93 */
/************************************************************************
 * This program takes a stab at converting RTF (Rich Text Format) files
 * into HTML. There are some limitations that keep RTF from being able to
 * easily represent things like in-line images and anchors as styles. In
 * particular, RTF styles apply to entire "paragraphs", so anchors or
 * images in the middle of a text stream can't easily be represented by
 * styles. The intent is to ultimately use something like embedded text
 * color changes to represent these constructs.
 *
 * In the meantime, you can take existing Word documents, apply the
 * correct style sheet, and convert them to HTML with this tool.
 *
 * AUTHOR: Chuck Shotton, UT-Houston Academic Computing,
 *         cshotton@oac.hsc.uth.tmc.edu
 *
 * USAGE: rtf2html [rtf_filename]
 *
 * BEHAVIOR:
 *     rtf2html will open the specified RTF input file or read from
 *     standard input, writing converted HTML to standard output.
 *
 * NOTES:
 *     The RTF document must be formatted with a style sheet that has
 *     style numberings that conform to the style_mappings table
 *     defined in this source file.
 *
 * MODIFICATIONS:
 *     6/21/93 : Chuck Shotton - created version 1.0.
 *
 ************************************************************************/

/* Note, the source is formatted with 4 character tabs */

#include <ctype.h>
#include <stdio.h>
#include <string.h>
#ifdef THINK_C
#include <console.h>
#endif

#ifndef TRUE
#define TRUE -1
#define FALSE 0
#endif

#define MAX_LEVELS 20			/*defines the # of nested in-line styles (pairs of {})*/
#define MAX_STYLES 12
#define MAX_INLINE_STYLES 4		/*defines # of in-line styles, bold, italic, etc.*/
#define MAX_TOKEN_LEN 40		/*size of the control word / argument buffers*/

typedef enum {
	s_plain, s_bold, s_italic, s_underline,			/*in-line styles*/
	s_para,											/*pseudo style*/
	s_h0, s_h1, s_h2, s_h3, s_h4, s_h5, s_h6		/*heading styles*/
} StyleState;

char *styles[MAX_STYLES][2] = {		/*HTML Start and end tags for styles*/
	{"", ""},
	{"<b>", "</b>"},
	{"<i>", "</i>"},
	{"<em>", "</em>"},
	{"<p>", ""},
	{"", ""},
	{"<h1>", "</h1>"},
	{"<h2>", "</h2>"},
	{"<h3>", "</h3>"},
	{"<h4>", "</h4>"},
	{"<h5>", "</h5>"},
	{"<h6>", "</h6>"}
};

/* style_mappings maps the style numbers in a RTF style sheet into one of the*/
/* (currently) six paragraph-oriented HTML styles (i.e. heading 1 through 6.)*/
/* Additional styles for lists, etc. should be added here. Style info        */
/* ultimately should be read from some sort of config file into these tables.*/
char *style_mappings[7] = {
	"", "255", "254", "253", "252", "251", "250"
};

/* RTF tokens that mean something to the parser. All others are ignored. */
typedef enum {
	t_start, t_fonttbl, t_colortbl, t_stylesheet, t_info, t_s, t_b, t_u, t_i,
	t_plain, t_par, t_end
} TokenIndex;

char *tokens[] = {
	"###", "fonttbl", "colortbl", "stylesheet", "info", "s", "b", "ul", "i",
	"plain", "par", "###"
};

char style_state[MAX_LEVELS][MAX_INLINE_STYLES],	/*inline styles saved per {} level*/
	 curr_style[MAX_INLINE_STYLES];					/*inline styles currently active*/
short curr_heading;				/*index into styles[] of the open heading, or s_plain*/
short level,					/*current {} nesting level*/
	  skip_to_level,			/*{} level to which parsing should skip (used to skip */
								/*  font tables, style sheets, color tables, etc.)    */
	  gobble,					/*Flag set to indicate all input should be discarded  */
	  ignore_styles;			/*Set to ignore inline style expansions after style use*/

/**************************************/
/* Read the next character from f. Returns an int so that EOF can be    */
/* told apart from a legitimate 0xFF data byte.                         */
int RTF_GetChar(FILE *f)
{
	return fgetc(f);
}

/**************************************/
/* Write s to stdout verbatim (used for HTML tags), unless gobbling.    */
void RTF_PutStr(const char *s)
{
	if (gobble)
		return;
	fputs(s, stdout);
}

/**************************************/
/* Write one character to stdout, replacing the HTML markup characters  */
/* '<', '>' and '&' with their entities. Does not look at gobble.       */
static void RTF_PutEscaped(int ch)
{
	switch (ch) {
	case '<':
		fputs("&lt;", stdout);
		break;
	case '>':
		fputs("&gt;", stdout);
		break;
	case '&':
		fputs("&amp;", stdout);
		break;
	default:
		fputc(ch, stdout);
		break;
	}
}

/**************************************/
/* Write one character of document text as HTML, unless gobbling.       */
void RTF_PutChar(char ch)
{
	if (gobble)
		return;
	RTF_PutEscaped((unsigned char) ch);
}

/**************************************/
/* Clear every inline style flag in s (MAX_INLINE_STYLES entries).      */
void RTF_PlainStyle(char *s)
{
	memset(s, 0, MAX_INLINE_STYLES);
}

/**************************************/
/* Copy the MAX_INLINE_STYLES flags of s into d.                        */
void RTF_CopyStyle(const char *s, char *d)
{
	memcpy(d, s, MAX_INLINE_STYLES);
}

/**************************************/
/* Enter a {} group: remember the active inline styles and bump *level. */
/* Groups nested deeper than MAX_LEVELS are still counted so the braces */
/* stay balanced, but no state is saved for them.                       */
void RTF_PushState(short *level)
{
	if (*level >= 0 && *level < MAX_LEVELS)
		RTF_CopyStyle(curr_style, style_state[*level]);
	(*level)++;
}

/**************************************/
/* Leave a {} group: drop *level, stop gobbling if this closes the      */
/* group being skipped, and bring the emitted tags back in line with    */
/* the styles that were active when the group was entered.              */
void RTF_PopState(short *level)
{
	int j;
	char restored[MAX_INLINE_STYLES];

	if (*level <= 0) {
		/*stray '}': there is no saved state to return to*/
		fprintf(stderr, "\nRTF Error: unbalanced '}'.\n");
		return;
	}

	(*level)--;

	/*stop gobbling first so tags owed to the enclosing group are not lost*/
	if (*level == skip_to_level) {
		skip_to_level = -1;
		gobble = FALSE;
	}

	if (*level >= MAX_LEVELS)
		return;		/*nothing was saved this deep; keep the current styles*/

	RTF_CopyStyle(style_state[*level], restored);
	for (j = 0; j < MAX_INLINE_STYLES; j++) {
		if (curr_style[j] && !restored[j])
			RTF_PutStr(styles[j][1]);	/*turned on inside the group*/
		else if (!curr_style[j] && restored[j])
			RTF_PutStr(styles[j][0]);	/*turned off inside the group*/
	}
	RTF_CopyStyle(restored, curr_style);
}

/**************************************/
/* Emit the document title; s is escaped since it may be a file name.   */
void RTF_Title(const char *s)
{
	fputs("<title>", stdout);
	for (; *s; s++)
		RTF_PutEscaped((unsigned char) *s);
	fputs("</title>", stdout);
}

/**************************************/
/* Append ch to the NUL-terminated string in token. The buffer must be  */
/* at least MAX_TOKEN_LEN bytes; characters that do not fit are dropped */
/* (an over-long word cannot match any known token anyway).             */
void RTF_BuildToken(char *token, char ch)
{
	size_t len = strlen(token);

	if (len + 1 < MAX_TOKEN_LEN) {
		token[len] = ch;
		token[len + 1] = '\0';
	}
}

/**************************************/
/* Map a style number into a HTML heading */
/* Returns 1..6 for a mapped style number, 0 when s is not in the table.*/
short RTF_MapStyle(const char *s)
{
	int i;
	int count = (int) (sizeof style_mappings / sizeof style_mappings[0]);

	for (i = 0; i < count; i++) {
		if (!strcmp(style_mappings[i], s))
			return (short) i;
	}
	return (0);
}

/**************************************/
/* Start discarding output until the enclosing {} group is closed.      */
static void RTF_SkipGroup(void)
{
	if (gobble)
		return;		/*already skipping an outer group*/
	gobble = TRUE;	/*perform no output, ignore commands 'til level-1*/
	skip_to_level = level - 1;
}

/**************************************/
/* Close every active inline style and mark them all inactive.          */
static void RTF_CloseInlineStyles(void)
{
	int j;

	for (j = 0; j < MAX_INLINE_STYLES; j++) {
		if (curr_style[j])
			RTF_PutStr(styles[j][1]);
	}
	RTF_PlainStyle(curr_style);
}

/**************************************/
/* Handle an inline style control word (\b, \i, \ul). An argument of    */
/* "0" turns the style off; anything else turns it on. Tags are only    */
/* emitted when the state really changes.                               */
static void RTF_SetInlineStyle(StyleState which, const char *arg)
{
	if (ignore_styles)
		return;

	if (!strcmp(arg, "0")) {
		if (curr_style[which]) {
			RTF_PutStr(styles[which][1]);
			curr_style[which] = FALSE;
		}
	} else if (!curr_style[which]) {
		RTF_PutStr(styles[which][0]);
		curr_style[which] = TRUE;
	}
}

/**************************************/
/* Perform actions for RTF control words */
/* control is the word without its backslash, arg its numeric argument  */
/* as text ("" when absent). Unknown words are ignored, as is           */
/* everything while gobbling.                                           */
void RTF_DoControl(const char *control, const char *arg)
{
	int i;

	if (gobble)
		return;

	/*tokens[t_start] and tokens[t_end] are sentinels, not control words*/
	for (i = t_start + 1; i < t_end; i++) {
		if (!strcmp(control, tokens[i]))
			break;
	}

	switch ((TokenIndex) i) {
	case t_fonttbl:		/*skip all of these and their contents!*/
	case t_colortbl:
	case t_stylesheet:
	case t_info:
		RTF_SkipGroup();
		break;
	case t_s:			/*Style*/
		if (curr_heading)	/*close a heading that never saw its \par*/
			RTF_PutStr(styles[curr_heading][1]);
		curr_heading = (short) (s_h0 + RTF_MapStyle(arg));
		RTF_PutStr(styles[curr_heading][0]);
		ignore_styles = TRUE;
		break;
	case t_b:			/*Bold*/
		RTF_SetInlineStyle(s_bold, arg);
		break;
	case t_u:			/*Underline, maps to "emphasis" HTML style*/
		RTF_SetInlineStyle(s_underline, arg);
		break;
	case t_i:			/*Italic*/
		RTF_SetInlineStyle(s_italic, arg);
		break;
	case t_par:			/*Paragraph*/
		if (curr_heading) {
			RTF_PutStr(styles[curr_heading][1]);
			curr_heading = s_plain;
		} else {
			RTF_PutStr(styles[s_para][0]);
		}
		ignore_styles = FALSE;
		break;
	case t_plain:		/*reset inline styles*/
		RTF_CloseInlineStyles();
		break;
	default:			/*not a control word we care about*/
		break;
	}
}

/**************************************/
/* RTF_Parse is a crude, ugly state machine that understands enough of  */
/* the RTF syntax to be dangerous.                                      */
typedef enum {plaintext, control, argument, backslash, hexescape} ParseState;

/* Handle a character that is not part of a control word and return the */
/* next parser state. after_control is nonzero when ch is the character */
/* that ended a control word, in which case whitespace is a delimiter   */
/* and is not output.                                                   */
static ParseState RTF_DoChar(int ch, int after_control)
{
	switch (ch) {
	case '\\':
		return backslash;
	case '{':
		RTF_PushState(&level);
		break;
	case '}':
		RTF_PopState(&level);
		break;
	default:
		if (!after_control || !isspace(ch))
			RTF_PutChar((char) ch);
		break;
	}
	return plaintext;
}

/* Convert the RTF file named filename (or stdin when filename is NULL) */
/* to HTML on stdout. Returns 0 on success, -1 if the input cannot be   */
/* opened or a read error occurs.                                       */
/* Known limitation: \'hh hex escapes are reported on stderr and        */
/* dropped, because the document's code page is not tracked.            */
int RTF_Parse(char *filename)
{
	FILE *f;
	int ch;
	int hex_left = 0;		/*hex digits of a \'hh escape still to skip*/
	int status = 0;
	ParseState state;
	char token[MAX_TOKEN_LEN], arg[MAX_TOKEN_LEN];

	if (filename) {
		if (!(f = fopen(filename, "r"))) {
			fprintf(stderr, "\nError: Input file %s not found.\n", filename);
			return (-1);
		}
		RTF_Title(filename);
	} else {
		f = stdin;
		RTF_Title("STDIN");
	}

	state = plaintext;
	level = 0;
	skip_to_level = -1;
	gobble = FALSE;
	ignore_styles = FALSE;
	token[0] = '\0';
	arg[0] = '\0';

	/*test the int result against EOF; looping on feof() would process*/
	/*the EOF indicator itself as one extra character                 */
	while ((ch = RTF_GetChar(f)) != EOF) {
		switch (state) {
		case plaintext:		/*this is just normal user content*/
			state = RTF_DoChar(ch, FALSE);
			break;

		case backslash:		/*something special like a command or escape*/
			state = plaintext;
			switch (ch) {
			case '\\':
			case '{':
			case '}':
				RTF_PutChar((char) ch);
				break;
			case '*':		/*ignorable destination: skip the whole group*/
				RTF_SkipGroup();
				break;
			default:
				if (isalpha(ch)) {
					state = control;
					token[0] = '\0';
					RTF_BuildToken(token, (char) ch);
				} else {
					/*a control symbol is exactly one character long*/
					fprintf(stderr,
						"\nRTF Error: unexpected '%c' after \\.\n", ch);
					if (ch == '\'') {
						hex_left = 2;
						state = hexescape;
					}
				}
				break;
			}
			break;

		case hexescape:		/*discarding the two digits of a \'hh escape*/
			if (!isxdigit(ch)) {
				state = RTF_DoChar(ch, FALSE);	/*malformed escape*/
			} else if (--hex_left == 0) {
				state = plaintext;
			}
			break;

		case control:		/*collecting the command token*/
			if (isalpha(ch)) {
				RTF_BuildToken(token, (char) ch);
			} else if (isdigit(ch) || ch == '-') {
				/*a hyphen introduces a negative numeric argument*/
				state = argument;
				arg[0] = '\0';
				RTF_BuildToken(arg, (char) ch);
			} else {
				RTF_DoControl(token, "");
				state = RTF_DoChar(ch, TRUE);
			}
			break;

		case argument:		/*collecting the optional command argument*/
			if (isdigit(ch)) {
				RTF_BuildToken(arg, (char) ch);
			} else {
				RTF_DoControl(token, arg);
				state = RTF_DoChar(ch, TRUE);
			}
			break;
		}/*switch*/
	}/*while*/

	/*a control word cut off by end of input still takes effect*/
	if (state == control)
		RTF_DoControl(token, "");
	else if (state == argument)
		RTF_DoControl(token, arg);

	if (ferror(f)) {
		fprintf(stderr, "\nError: failure while reading input.\n");
		status = -1;
	}
	if (f != stdin)
		fclose(f);
	return (status);
}

/**************************************/
/* Reset the saved and current inline styles and the heading state.     */
void Initialize(void)
{
	int i;

	for (i = 0; i < MAX_LEVELS; i++)
		RTF_PlainStyle(style_state[i]);
	RTF_PlainStyle(curr_style);
	curr_heading = s_plain;
}

/**************************************/
/* Convert argv[1], or standard input when no file name is given.       */
int main(int argc, char **argv)
{
#ifdef THINK_C
	argc = ccommand(&argv);
#endif
	Initialize();
	if (argc > 1)
		return (RTF_Parse(argv[1]));
	return (RTF_Parse(NULL));
}