blob: 329f5ae8d401517ee3ef276eeb0f2e71ef298d40 [file] [log] [blame]
Eric Andersen6b6b3f61999-10-28 16:06:25 +00001/*
Mark Whitley6315ce62000-07-10 22:55:51 +00002 * sed.c - very minimalist version of sed
Eric Andersen6b6b3f61999-10-28 16:06:25 +00003 *
Erik Andersen61677fe2000-04-13 01:18:56 +00004 * Copyright (C) 1999,2000 by Lineo, inc.
Mark Whitley6315ce62000-07-10 22:55:51 +00005 * Written by Mark Whitley <markw@lineo.com>, <markw@enol.com>
Erik Andersen1266a131999-12-29 22:19:46 +00006 *
Eric Andersen6b6b3f61999-10-28 16:06:25 +00007 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 *
21 */
22
Mark Whitley6315ce62000-07-10 22:55:51 +000023/*
24 Supported features and commands in this version of sed:
25
26 - comments ('#')
27 - Address matching: num|/matchstr/[,num|/matchstr/|$]command
28 - Commands: p, d, s/match/replace/[g]
29
30 (Note: Specifying an address (range) to match is *optional*; commands
31 default to the whole pattern space if no specific address match was
32 requested.)
33
34 Unsupported features:
35
36 - transliteration (y/source-chars/dest-chars/) (use 'tr')
37 - no support for characters other than the '/' character for regex matches
38 - no pattern space hold space storing / swapping (x, etc.)
39 - no labels / branching (: label, b, t, and friends)
40 - and lots, lots more.
41
42*/
43
Eric Andersen6b6b3f61999-10-28 16:06:25 +000044#include <stdio.h>
Mark Whitley6315ce62000-07-10 22:55:51 +000045#include <stdlib.h> /* for realloc() */
46#include <unistd.h> /* for getopt() */
47#include <regex.h>
48#include <string.h> /* for strdup() */
Eric Andersen6b6b3f61999-10-28 16:06:25 +000049#include <errno.h>
Mark Whitley6315ce62000-07-10 22:55:51 +000050#include <ctype.h> /* for isspace() */
51#include "internal.h"
52
53
/* getopt() state; both are also declared by <unistd.h> */
extern int optind;
extern char *optarg;

/* command-line options: non-zero once -n has been given */
static int be_quiet = 0;
60
/* One parsed sed command: optional address(es), the command letter and,
 * for 's', the compiled search pattern plus replacement text. */
struct sed_cmd {

	/* address storage */
	int beg_line;		/* 'sed 1p'   0 == no beginning line, apply to all lines */
	int end_line;		/* 'sed 1,3p' 0 == no end line, use only beginning. -1 == $ */
	regex_t *beg_match;	/* sed -e '/match/cmd' */
	regex_t *end_match;	/* sed -e '/match/,/end_match/cmd' */

	/* the command itself: one of p, d, s */
	char cmd;

	/* fields used only by the substitution command */
	regex_t *sub_match;	/* the 'sub_match' in s/sub_match/replace/ */
	char *replace;		/* the 'replace' in s/sub_match/replace/
				 * XXX: who will hold the \1 \2 \3s? */
	unsigned int sub_g:1;	/* set for a trailing 'g', as in s/foo/bar/g (global) */
};
77
/* globals */
static struct sed_cmd *sed_cmds = NULL;	/* growable array holding the sequence of sed cmds */
static int ncmds = 0;			/* how many entries of sed_cmds are in use */
81
82/*static char *cur_file = NULL;*/ /* file currently being processed XXX: do I need this? */
Eric Andersen6b6b3f61999-10-28 16:06:25 +000083
/* Usage text; everything after the synopsis line is dropped when
 * BB_FEATURE_TRIVIAL_HELP is defined. */
static const char sed_usage[] =
	"sed [-Vhnef] pattern [files...]\n"
#ifndef BB_FEATURE_TRIVIAL_HELP
	"\n"
	"-n\tsuppress automatic printing of pattern space\n"
	"-e script\tadd the script to the commands to be executed\n"
	"-f scriptfile\tadd the contents of script-file to the commands to be executed\n"
	"-h\tdisplay this help message\n"
	"-V\toutput version information and exit\n"
	"\n"
	"If no -e or -f is given, the first non-option argument is taken as the\n"
	"sed script to interpret. All remaining arguments are names of input\n"
	"files; if no input files are specified, then the standard input is read.\n"
#endif
	;
Eric Andersen6b6b3f61999-10-28 16:06:25 +000099
Mark Whitley6315ce62000-07-10 22:55:51 +0000100#if 0
101/* Nuke from here { */
Erik Andersen1266a131999-12-29 22:19:46 +0000102
Erik Andersen1266a131999-12-29 22:19:46 +0000103
/* get_line_from_file() - reads one line from a text file, up to and
 * including the newline (the last line of a file may lack one).
 *
 * Returns a malloc'ed, NUL-terminated string which must be free'ed by the
 * caller, or NULL when nothing could be read (end of file) or when the
 * line buffer could not be grown. */
extern char *get_line_from_file(FILE *file)
{
	static const int GROWBY = 80;	/* how large we will grow strings by */

	int ch;
	size_t idx = 0;
	char *linebuf = NULL;
	size_t linebufsz = 0;

	while ((ch = fgetc(file)) != EOF) {
		/* keep room for this character plus the terminating NUL */
		if (idx + 2 > linebufsz) {
			char *grown = realloc(linebuf, linebufsz + GROWBY);

			if (grown == NULL) {
				/* realloc left the old buffer untouched; don't leak it */
				free(linebuf);
				return NULL;
			}
			linebuf = grown;
			linebufsz += GROWBY;
		}
		linebuf[idx++] = (char)ch;
		if ((char)ch == '\n')
			break;
	}

	/* nothing read at all: linebuf was never allocated */
	if (idx == 0)
		return NULL;

	linebuf[idx] = 0;
	return linebuf;
}
134
/* usage() - prints the given usage text to stdout, prefixed with "usage: ",
 * and terminates the program with exit status 0. */
static void usage(const char *string)
{
	printf("usage: %s\n", string);
	exit(0);
}
Erik Andersen1266a131999-12-29 22:19:46 +0000140
Mark Whitley6315ce62000-07-10 22:55:51 +0000141/* } to here when we integrate this into busybox */
142#endif
143
144static void destroy_cmd_strs()
145{
146 if (sed_cmds == NULL)
147 return;
148
149 /* destroy all the elements in the array */
150 while (--ncmds >= 0) {
151
152 if (sed_cmds[ncmds].beg_match) {
153 regfree(sed_cmds[ncmds].beg_match);
154 free(sed_cmds[ncmds].beg_match);
Erik Andersene49d5ec2000-02-08 19:58:47 +0000155 }
Mark Whitley6315ce62000-07-10 22:55:51 +0000156 if (sed_cmds[ncmds].end_match) {
157 regfree(sed_cmds[ncmds].end_match);
158 free(sed_cmds[ncmds].end_match);
159 }
160 if (sed_cmds[ncmds].sub_match) {
161 regfree(sed_cmds[ncmds].sub_match);
162 free(sed_cmds[ncmds].sub_match);
163 }
164 if (sed_cmds[ncmds].replace)
165 free(sed_cmds[ncmds].replace);
166 }
Erik Andersene49d5ec2000-02-08 19:58:47 +0000167
Mark Whitley6315ce62000-07-10 22:55:51 +0000168 /* destroy the array */
169 free(sed_cmds);
170 sed_cmds = NULL;
171}
172
/*
 * exit_sed - common exit path: frees the command table, optionally writes
 * 'message' (verbatim, no newline added) to stderr, then exits with
 * 'retcode'. Does not return.
 */
static void exit_sed(int retcode, const char *message)
{
	destroy_cmd_strs();

	if (message != NULL)
		fputs(message, stderr);

	exit(retcode);
}
180
/*
 * trim_str - trims leading and trailing space from a string
 *
 * Leading blanks are the characters " \n\t\v"; trailing blanks are anything
 * isspace() accepts. The input is not modified.
 *
 * Note: This returns a malloc'ed string so you must store and free it.
 * Returns NULL if the copy could not be allocated. An empty or all-blank
 * input yields an empty string.
 * XXX: This should be in the utility.c file.
 */
static char *trim_str(const char *str)
{
	/* skip leading whitespace without touching the input */
	const char *start = str + strspn(str, " \n\t\v");
	size_t len = strlen(start);
	char *retstr;

	/* back up over trailing whitespace; the len > 0 test stops us walking
	 * off the front of an empty / all-blank string */
	while (len > 0 && isspace((unsigned char)start[len - 1]))
		len--;

	retstr = malloc(len + 1);
	if (retstr == NULL)
		return NULL;

	memcpy(retstr, start, len);
	retstr[len] = 0;

	return retstr;
}
210
/*
 * index_of_next_unescaped_slash - walks left to right through a string,
 * beginning one character AFTER the specified index, and returns the index
 * of the next slash that is not immediately preceded by a backslash.
 *
 * Returns -1 if the end of the string is reached first (including the case
 * where 'idx' already sits on the terminating NUL).
 *
 * Only the single preceding character is examined, so a slash that follows
 * an escaped backslash ("\\/") is still treated as escaped.
 */
static int index_of_next_unescaped_slash(int idx, const char *str)
{
	/* nothing lies beyond the terminator; don't step past it */
	if (str[idx] == 0)
		return -1;

	for (idx++; str[idx] != 0; idx++) {
		if (str[idx] == '/' && str[idx - 1] != '\\')
			return idx;
	}

	/* hit the end without finding one */
	return -1;
}
226
/*
 * get_address - parses one address from the start of 'str'.
 *
 *   digits    -> *line = that number, *regex = NULL
 *   $         -> *line = -1,          *regex = NULL
 *   /regex/   -> *regex = newly malloc'ed, compiled pattern (*line untouched)
 *
 * 'line' may be NULL when the caller knows 'str' starts with '/'.
 *
 * Returns the index in the string just past where the address ends, or -1
 * (after printing a diagnostic) if 'str' does not start with an address.
 * Exits via exit_sed() on an unterminated /regex/, a regex that does not
 * compile, or memory exhaustion.
 */
static int get_address(const char *str, int *line, regex_t **regex)
{
	int idx = 0;

	if (isdigit((unsigned char)str[0])) {
		while (isdigit((unsigned char)str[idx]))
			idx++;
		/* atoi() stops at the first non-digit by itself */
		if (line)
			*line = atoi(str);
		*regex = NULL;
	}
	else if (str[0] == '$') {
		if (line)
			*line = -1;
		*regex = NULL;
		idx = 1;
	}
	else if (str[0] == '/') {
		int ret;
		char *pattern;
		int end = index_of_next_unescaped_slash(0, str);

		if (end == -1)
			exit_sed(1, "sed: unterminated match expression\n");

		/* copy the text between the two slashes: end - 1 chars + NUL */
		pattern = malloc(end);
		if (pattern == NULL)
			exit_sed(1, "sed: memory error\n");
		memcpy(pattern, str + 1, end - 1);
		pattern[end - 1] = 0;

		*regex = malloc(sizeof **regex);
		if (*regex == NULL) {
			free(pattern);
			exit_sed(1, "sed: memory error\n");
		}

		ret = regcomp(*regex, pattern, 0);
		free(pattern);
		if (ret != 0) {
			/* error handling if regular expression couldn't be compiled */
			size_t errmsgsz = regerror(ret, *regex, NULL, 0);
			char *errmsg = malloc(errmsgsz);

			if (errmsg == NULL)
				exit_sed(1, "sed: memory error\n");
			regerror(ret, *regex, errmsg, errmsgsz);
			fprintf(stderr, "sed: %s\n", errmsg);
			free(errmsg);
			/* a regex_t whose regcomp() failed must not reach regfree();
			 * drop it so the exit-time cleanup skips it */
			free(*regex);
			*regex = NULL;
			exit_sed(1, NULL);
		}
		idx = end + 1;	/* one past the closing '/' */
	}
	else {
		fprintf(stderr, "sed.c:get_address: no address found in string\n");
		fprintf(stderr, "\t(you probably didn't check the string you passed me)\n");
		idx = -1;
	}

	return idx;
}
Erik Andersene49d5ec2000-02-08 19:58:47 +0000282
Mark Whitley6315ce62000-07-10 22:55:51 +0000283static void parse_cmd_str(struct sed_cmd *sed_cmd, const char *cmdstr)
284{
285 int idx = 0;
286
287 /* parse the command
288 * format is: [addr][,addr]cmd
289 * |----||-----||-|
290 * part1 part2 part3
291 */
292
293 /* first part (if present) is an address: either a number or a /regex/ */
294 if (isdigit(cmdstr[idx]) || cmdstr[idx] == '/')
295 idx = get_address(cmdstr, &sed_cmd->beg_line, &sed_cmd->beg_match);
296
297 /* second part (if present) will begin with a comma */
298 if (cmdstr[idx] == ',')
299 idx += get_address(&cmdstr[++idx], &sed_cmd->end_line, &sed_cmd->end_match);
300
301 /* last part (mandatory) will be a command */
302 if (cmdstr[idx] == '\0')
303 exit_sed(1, "sed: missing command\n");
304 if (!strchr("pds", cmdstr[idx])) /* <-- XXX add new commands here */
305 exit_sed(1, "sed: invalid command\n");
306 sed_cmd->cmd = cmdstr[idx];
307 /* special-case handling for 's' */
308 if (sed_cmd->cmd == 's') {
309 int oldidx;
310 /* format for substitution is:
311 * s/match/replace/g
312 * | |
313 * mandatory optional
314 */
315
316 /* verify that we have an 's' followed by a 'slash' */
317 if (cmdstr[++idx] != '/')
318 exit_sed(1, "sed: bad format in substitution expression\n");
319
320 /* get the substitution part */
321 idx += get_address(&cmdstr[idx], NULL, &sed_cmd->sub_match);
322
323 /* get the replacement part */
324 oldidx = idx;
325 idx = index_of_next_unescaped_slash(idx, cmdstr);
326 sed_cmd->replace = (char *)malloc(idx - oldidx + 1);
327 strncpy(sed_cmd->replace, &cmdstr[oldidx], idx - oldidx);
328 sed_cmd->replace[idx - oldidx] = 0;
329
330 /* store the 'g' if present */
331 if (cmdstr[++idx] == 'g')
332 sed_cmd->sub_g = 1;
Erik Andersen1266a131999-12-29 22:19:46 +0000333 }
Eric Andersen50d63601999-11-09 01:47:36 +0000334}
Eric Andersen6b6b3f61999-10-28 16:06:25 +0000335
Mark Whitley6315ce62000-07-10 22:55:51 +0000336static void add_cmd_str(const char *cmdstr)
Erik Andersen1266a131999-12-29 22:19:46 +0000337{
Mark Whitley6315ce62000-07-10 22:55:51 +0000338 char *my_cmdstr = trim_str(cmdstr);
Erik Andersen1266a131999-12-29 22:19:46 +0000339
Mark Whitley6315ce62000-07-10 22:55:51 +0000340 /* if this is a comment, don't even bother */
341 if (my_cmdstr[0] == '#') {
342 free(my_cmdstr);
343 return;
344 }
345
346 /* grow the array */
347 sed_cmds = realloc(sed_cmds, sizeof(struct sed_cmd) * (++ncmds));
348 /* zero new element */
349 memset(&sed_cmds[ncmds-1], 0, sizeof(struct sed_cmd));
350 /* load command string into new array element */
351 parse_cmd_str(&sed_cmds[ncmds-1], my_cmdstr);
352}
353
354
/*
 * load_cmd_file - reads a sed script from 'filename', adding each line of
 * it as one command via add_cmd_str().
 *
 * Exits via exit_sed() with a diagnostic if the file cannot be opened.
 */
static void load_cmd_file(char *filename)
{
	FILE *cmdfile;
	char *line;

	cmdfile = fopen(filename, "r");
	if (cmdfile == NULL) {
		fprintf(stderr, "sed: %s: %s\n", filename, strerror(errno));
		exit_sed(1, NULL);
	}

	while ((line = get_line_from_file(cmdfile)) != NULL) {
		size_t len = strlen(line);

		/* eat the newline -- but only if there is one: the last line of
		 * a file may end without it */
		if (len > 0 && line[len - 1] == '\n')
			line[len - 1] = 0;
		add_cmd_str(line);
		free(line);
	}

	fclose(cmdfile);
}
370
371
372static int do_sed_command(const struct sed_cmd *sed_cmd, const char *line)
373{
374 int altered = 0;
375
376 switch (sed_cmd->cmd) {
377
378 case 'p':
379 fputs(line, stdout);
380 break;
381
382 case 'd':
383 altered++;
384 break;
385
386 case 's': /* oo, a fun one :-) */
387
388 /* we only substitute if the substitution 'search' expression matches */
389 if (regexec(sed_cmd->sub_match, line, 0, NULL, 0) == 0) {
390 regmatch_t regmatch;
391 int i;
392 char *ptr = (char *)line;
393
394 while (*ptr) {
395 /* if we can match the search string... */
396 if (regexec(sed_cmd->sub_match, ptr, 1, &regmatch, 0) == 0) {
397 /* print everything before the match, */
398 for (i = 0; i < regmatch.rm_so; i++)
399 fputc(ptr[i], stdout);
400 /* then print the substitution in its place */
401 fputs(sed_cmd->replace, stdout);
402 /* then advance past the match */
403 ptr += regmatch.rm_eo;
404 /* and let the calling function know that something
405 * has been changed */
406 altered++;
407
408 /* if we're not doing this globally... */
409 if (!sed_cmd->sub_g)
410 break;
411 }
412 /* if we COULD NOT match the search string (meaning we've gone past
413 * all previous instances), get out */
414 else
415 break;
416 }
417
418 /* is there anything left to print? */
419 if (*ptr)
420 fputs(ptr, stdout);
421 }
422
423 break;
424 }
425
426 return altered;
427}
428
429static void process_file(FILE *file)
430{
431 char *line = NULL;
432 static int linenum = 0; /* GNU sed does not restart counting lines at EOF */
433 unsigned int still_in_range = 0;
434 int line_altered;
435 int i;
436
437 /* go through every line in the file */
438 while ((line = get_line_from_file(file)) != NULL) {
439
440 linenum++;
441 line_altered = 0;
442
443 /* for every line, go through all the commands */
444 for (i = 0; i < ncmds; i++) {
445
446 /* are we acting on a range of matched lines? */
447 if (sed_cmds[i].beg_match && sed_cmds[i].end_match) {
448 if (still_in_range || regexec(sed_cmds[i].beg_match, line, 0, NULL, 0) == 0) {
449 line_altered += do_sed_command(&sed_cmds[i], line);
450 still_in_range = 1;
451 if (regexec(sed_cmds[i].end_match, line, 0, NULL, 0) == 0)
452 still_in_range = 0;
453 }
454 }
455
456 /* are we trying to match a single line? */
457 else if (sed_cmds[i].beg_match) {
458 if (regexec(sed_cmds[i].beg_match, line, 0, NULL, 0) == 0)
459 line_altered += do_sed_command(&sed_cmds[i], line);
460 }
461
462 /* are we acting on a range of line numbers? */
463 else if (sed_cmds[i].beg_line > 0 && sed_cmds[i].end_line > 0) {
464 if (linenum >= sed_cmds[i].beg_line && linenum <= sed_cmds[i].end_line)
465 line_altered += do_sed_command(&sed_cmds[i], line);
466 }
467
468 /* are we acting on a specified line number */
469 else if (sed_cmds[i].beg_line > 0) {
470 if (linenum == sed_cmds[i].beg_line)
471 line_altered += do_sed_command(&sed_cmds[i], line);
472 }
473
474 /* not acting on matches or line numbers. act on every line */
475 else
476 line_altered += do_sed_command(&sed_cmds[i], line);
477
Erik Andersene49d5ec2000-02-08 19:58:47 +0000478 }
Erik Andersen1266a131999-12-29 22:19:46 +0000479
Mark Whitley6315ce62000-07-10 22:55:51 +0000480 /* we will print the line unless we were told to be quiet or if the
481 * line was altered (via a 'd'elete or 's'ubstitution) */
482 if (!be_quiet && !line_altered)
483 fputs(line, stdout);
484
485 free(line);
Erik Andersene49d5ec2000-02-08 19:58:47 +0000486 }
Erik Andersen1266a131999-12-29 22:19:46 +0000487}
488
489extern int sed_main(int argc, char **argv)
Eric Andersen6b6b3f61999-10-28 16:06:25 +0000490{
Mark Whitley6315ce62000-07-10 22:55:51 +0000491 int opt;
Eric Andersen6b6b3f61999-10-28 16:06:25 +0000492
Mark Whitley6315ce62000-07-10 22:55:51 +0000493 /* do special-case option parsing */
494 if (argv[1] && (strcmp(argv[1], "--help") == 0))
Eric Andersenc1525e81999-10-29 00:07:31 +0000495 usage(sed_usage);
Eric Andersen6b6b3f61999-10-28 16:06:25 +0000496
Mark Whitley6315ce62000-07-10 22:55:51 +0000497 /* do normal option parsing */
498 while ((opt = getopt(argc, argv, "Vhne:f:")) > 0) {
499 switch (opt) {
500 case 'V':
501 printf("Print Busybox version here\n");
502 exit(0);
503 break;
504 case 'h':
505 usage(sed_usage);
506 break;
Erik Andersene916d242000-03-06 19:20:35 +0000507 case 'n':
Mark Whitley6315ce62000-07-10 22:55:51 +0000508 be_quiet++;
Erik Andersene916d242000-03-06 19:20:35 +0000509 break;
510 case 'e':
Mark Whitley6315ce62000-07-10 22:55:51 +0000511 add_cmd_str(optarg);
Erik Andersene916d242000-03-06 19:20:35 +0000512 break;
Mark Whitley6315ce62000-07-10 22:55:51 +0000513 case 'f':
514 load_cmd_file(optarg);
Erik Andersene49d5ec2000-02-08 19:58:47 +0000515 break;
Erik Andersene49d5ec2000-02-08 19:58:47 +0000516 }
Eric Andersen6b6b3f61999-10-28 16:06:25 +0000517 }
Mark Whitley6315ce62000-07-10 22:55:51 +0000518
519 /* if we didn't get a pattern from a -e and no command file was specified,
520 * argv[optind] should be the pattern. no pattern, no worky */
521 if (ncmds == 0) {
522 if (argv[optind] == NULL)
523 usage(sed_usage);
524 else {
525 add_cmd_str(argv[optind]);
526 optind++;
527 }
528 }
529
530
531 /* argv[(optind)..(argc-1)] should be names of file to process. If no
532 * files were specified or '-' was specified, take input from stdin.
533 * Otherwise, we process all the files specified. */
534 if (argv[optind] == NULL || (strcmp(argv[optind], "-") == 0)) {
535 process_file(stdin);
536 }
537 else {
538 int i;
539 FILE *file;
540 for (i = optind; i < argc; i++) {
541 file = fopen(argv[i], "r");
542 if (file == NULL) {
543 fprintf(stderr, "sed: %s: %s\n", argv[i], strerror(errno));
544 } else {
545 process_file(file);
546 fclose(file);
547 }
548 }
549 }
550
551 exit_sed(0, NULL);
552
553 /* not reached */
554 return 0;
Eric Andersen6b6b3f61999-10-28 16:06:25 +0000555}
556
Mark Whitley6315ce62000-07-10 22:55:51 +0000557#ifdef TEST_SED
/* Stand-alone entry point, built only when TEST_SED is defined: hands the
 * command line straight to sed_main() and passes its result back. */
int main(int argc, char **argv)
{
	int status = sed_main(argc, argv);

	return status;
}
562#endif