blob: 2fb243fb94ae4e09861b02aef6d868ed010a292e [file] [log] [blame]
/*
 * sed.c - very minimalist version of sed
 *
 * Copyright (C) 1999,2000 by Lineo, inc.
 * Written by Mark Whitley <markw@lineo.com>, <markw@enol.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 */

22
/*
	Supported features and commands in this version of sed:

	 - comments ('#')
	 - Address matching: num|/matchstr/[,num|/matchstr/|$]command
	 - Commands: p, d, s/match/replace/[gI]

	 (Note: Specifying an address (range) to match is *optional*; commands
	 default to the whole pattern space if no specific address match was
	 requested.)

	Unsupported features:

	 - transliteration (y/source-chars/dest-chars/) (use 'tr')
	 - no support for delimiters other than the '/' character in regex matches
	 - no hold space (storing / swapping the pattern space: x, etc.)
	 - no labels / branching (: label, b, t, and friends)
	 - and lots, lots more.

*/
43
Eric Andersen6b6b3f61999-10-28 16:06:25 +000044#include <stdio.h>
Mark Whitley6315ce62000-07-10 22:55:51 +000045#include <stdlib.h> /* for realloc() */
46#include <unistd.h> /* for getopt() */
47#include <regex.h>
48#include <string.h> /* for strdup() */
Eric Andersen6b6b3f61999-10-28 16:06:25 +000049#include <errno.h>
Mark Whitley6315ce62000-07-10 22:55:51 +000050#include <ctype.h> /* for isspace() */
51#include "internal.h"
52
53
/* getopt() state; these objects are declared by <unistd.h> */
extern int optind;
extern char *optarg;

/* command-line options */
static int be_quiet = 0;	/* incremented by -n: suppress the automatic print */
60
/*
 * One parsed sed command: an optional address or address range, the command
 * letter, and the extra data the 's' command needs.
 */
struct sed_cmd {

	/* address storage */
	int beg_line;		/* 'sed 1p': 0 == no beginning line, apply to all lines */
	int end_line;		/* 'sed 1,3p': 0 == no end line, use only beginning; -1 == $ */
	regex_t *beg_match;	/* sed -e '/match/cmd' */
	regex_t *end_match;	/* sed -e '/match/,/end_match/cmd' */

	/* the command */
	char cmd;		/* one of p, d, s */

	/* fields used only by the substitution command */
	regex_t *sub_match;	/* the 'sub_match' in s/sub_match/replace/ */
	char *replace;		/* the 'replace' in s/sub_match/replace/ (no \1 \2 \3 support) */
	unsigned int sub_g:1;	/* set for s/foo/bar/g (global) */
};
77
/* file-wide state */
static struct sed_cmd *sed_cmds = NULL;	/* growable array holding the sequence of sed commands */
static int ncmds = 0;			/* how many entries of sed_cmds are in use */

/*static char *cur_file = NULL;*/ /* file currently being processed XXX: do I need this? */
Eric Andersen6b6b3f61999-10-28 16:06:25 +000083
/* Help text handed to usage(); the long form is compiled out when
 * BB_FEATURE_TRIVIAL_HELP is defined. */
static const char sed_usage[] =
	"sed [-Vhnef] pattern [files...]\n"
#ifndef BB_FEATURE_TRIVIAL_HELP
	"\n"
	"-n\tsuppress automatic printing of pattern space\n"
	"-e script\tadd the script to the commands to be executed\n"
	"-f scriptfile\tadd the contents of script-file to the commands to be executed\n"
	"-h\tdisplay this help message\n"
	"-V\toutput version information and exit\n"
	"\n"
	"If no -e or -f is given, the first non-option argument is taken as the\n"
	"sed script to interpret. All remaining arguments are names of input\n"
	"files; if no input files are specified, then the standard input is read.\n"
#endif
	;
Eric Andersen6b6b3f61999-10-28 16:06:25 +000099
Mark Whitley6315ce62000-07-10 22:55:51 +0000100static void destroy_cmd_strs()
101{
102 if (sed_cmds == NULL)
103 return;
104
105 /* destroy all the elements in the array */
106 while (--ncmds >= 0) {
107
108 if (sed_cmds[ncmds].beg_match) {
109 regfree(sed_cmds[ncmds].beg_match);
110 free(sed_cmds[ncmds].beg_match);
Erik Andersene49d5ec2000-02-08 19:58:47 +0000111 }
Mark Whitley6315ce62000-07-10 22:55:51 +0000112 if (sed_cmds[ncmds].end_match) {
113 regfree(sed_cmds[ncmds].end_match);
114 free(sed_cmds[ncmds].end_match);
115 }
116 if (sed_cmds[ncmds].sub_match) {
117 regfree(sed_cmds[ncmds].sub_match);
118 free(sed_cmds[ncmds].sub_match);
119 }
120 if (sed_cmds[ncmds].replace)
121 free(sed_cmds[ncmds].replace);
122 }
Erik Andersene49d5ec2000-02-08 19:58:47 +0000123
Mark Whitley6315ce62000-07-10 22:55:51 +0000124 /* destroy the array */
125 free(sed_cmds);
126 sed_cmds = NULL;
127}
128
/*
 * exit_sed - tear down the command list and terminate the program
 *
 * retcode: exit status handed to exit()
 * message: optional text written verbatim to stderr first; NULL for none
 */
static void exit_sed(int retcode, const char *message)
{
	destroy_cmd_strs();

	if (message != NULL)
		fputs(message, stderr);

	exit(retcode);
}
136
/*
 * trim_str - returns a copy of a string without leading and trailing space
 *
 * Leading characters from the set " \n\t\v" and trailing characters for
 * which isspace() is true are dropped.  Empty and all-whitespace input yield
 * an empty string.
 *
 * Note: This returns a malloc'ed string so you must store and free it.
 * Returns NULL if the allocation fails.
 * XXX: This should be in the utility.c file.
 */
static char *trim_str(const char *str)
{
	/* skip leading whitespace without touching the input */
	const char *start = str + strspn(str, " \n\t\v");
	size_t len = strlen(start);
	char *retstr;

	/* trim trailing whitespace; the len > 0 guard keeps the scan inside
	 * the string for empty or all-blank input, and the cast keeps
	 * isspace() defined for bytes with the high bit set */
	while (len > 0 && isspace((unsigned char)start[len - 1]))
		len--;

	retstr = malloc(len + 1);
	if (retstr == NULL)
		return NULL;

	memcpy(retstr, start, len);
	retstr[len] = '\0';

	return retstr;
}
166
/*
 * index_of_next_unescaped_slash - walks left to right through a string,
 * beginning just after a specified index, and returns the index of the next
 * slash that is not preceded by an escaping backslash.
 *
 * idx: position to start after (callers pass the index of the opening '/')
 * str: NUL-terminated string to scan
 *
 * Returns the index of the slash, or -1 if the end of the string is reached
 * first.  A backslash escapes exactly the one character that follows it, so
 * "\/" is skipped while the slash in "\\/" still counts.
 */
static int index_of_next_unescaped_slash(int idx, const char *str)
{
	for (idx++; str[idx] != '\0'; idx++) {
		if (str[idx] == '/')
			return idx;
		/* step over the escaped character, unless the backslash is
		 * the last character of the string */
		if (str[idx] == '\\' && str[idx + 1] != '\0')
			idx++;
	}

	/* hit the end without finding a terminating slash */
	return -1;
}
182
/*
 * get_address - parses one address at the start of a string
 *
 * str:   text beginning with an address: a line number, '$', or /regex/
 * line:  receives the line number, -1 for '$', or 0 for a regex address
 * regex: receives a newly allocated compiled regex for a /regex/ address,
 *        NULL otherwise
 *
 * Returns the index in the string just past where the address ends, or -1
 * (after printing a diagnostic) if the string does not start with an
 * address.  An unterminated or uncompilable regex ends the program through
 * exit_sed().
 */
static int get_address(const char *str, int *line, regex_t **regex)
{
	int idx = 0;

	if (isdigit((unsigned char)str[idx])) {
		do {
			idx++;
		} while (isdigit((unsigned char)str[idx]));
		/* atoi() stops at the first non-digit by itself */
		*line = atoi(str);
		*regex = NULL;
	}
	else if (str[idx] == '$') {
		*line = -1;
		*regex = NULL;
		idx++;
	}
	else if (str[idx] == '/') {
		char *pattern;
		int end = index_of_next_unescaped_slash(idx, str);

		if (end == -1)
			exit_sed(1, "sed: unterminated match expression\n");

		/* copy the text between the two slashes */
		pattern = xmalloc(end);
		memcpy(pattern, str + 1, end - 1);
		pattern[end - 1] = '\0';

		*line = 0;
		*regex = (regex_t *)xmalloc(sizeof(regex_t));
		if (bb_regcomp(*regex, pattern, REG_NEWLINE) != 0) {
			/* do not leave a regex_t that failed to compile where
			 * the cleanup code would hand it to regfree() */
			free(*regex);
			*regex = NULL;
			free(pattern);
			exit_sed(1, NULL);
		}
		free(pattern);

		/* step past the closing slash so the caller sees whatever
		 * follows the address (',' or the command) */
		idx = end + 1;
	}
	else {
		fprintf(stderr, "sed.c:get_address: no address found in string\n");
		fprintf(stderr, "\t(you probably didn't check the string you passed me)\n");
		idx = -1;
	}

	return idx;
}
Erik Andersene49d5ec2000-02-08 19:58:47 +0000224
/*
 * strdup_substr - copies the characters str[start] .. str[end-1] into a
 * newly allocated, NUL-terminated string which the caller must free.
 */
static char *strdup_substr(const char *str, int start, int end)
{
	int len = end - start;
	char *copy = xmalloc(len + 1);	/* one extra byte for the terminator */

	memcpy(copy, str + start, len);
	copy[len] = '\0';

	return copy;
}
233
Mark Whitley6315ce62000-07-10 22:55:51 +0000234static void parse_cmd_str(struct sed_cmd *sed_cmd, const char *cmdstr)
235{
236 int idx = 0;
237
238 /* parse the command
239 * format is: [addr][,addr]cmd
240 * |----||-----||-|
241 * part1 part2 part3
242 */
243
244 /* first part (if present) is an address: either a number or a /regex/ */
245 if (isdigit(cmdstr[idx]) || cmdstr[idx] == '/')
246 idx = get_address(cmdstr, &sed_cmd->beg_line, &sed_cmd->beg_match);
247
248 /* second part (if present) will begin with a comma */
249 if (cmdstr[idx] == ',')
250 idx += get_address(&cmdstr[++idx], &sed_cmd->end_line, &sed_cmd->end_match);
251
252 /* last part (mandatory) will be a command */
253 if (cmdstr[idx] == '\0')
254 exit_sed(1, "sed: missing command\n");
255 if (!strchr("pds", cmdstr[idx])) /* <-- XXX add new commands here */
256 exit_sed(1, "sed: invalid command\n");
257 sed_cmd->cmd = cmdstr[idx];
258 /* special-case handling for 's' */
259 if (sed_cmd->cmd == 's') {
Mark Whitleydf5f6ba2000-07-11 16:53:56 +0000260 int oldidx, cflags = REG_NEWLINE;
261 char *match;
Mark Whitley6315ce62000-07-10 22:55:51 +0000262 /* format for substitution is:
Mark Whitleydf5f6ba2000-07-11 16:53:56 +0000263 * s/match/replace/gI
264 * | ||
Mark Whitley6315ce62000-07-10 22:55:51 +0000265 * mandatory optional
266 */
267
268 /* verify that we have an 's' followed by a 'slash' */
269 if (cmdstr[++idx] != '/')
270 exit_sed(1, "sed: bad format in substitution expression\n");
271
Mark Whitleydf5f6ba2000-07-11 16:53:56 +0000272 /* save the match string */
273 oldidx = idx+1;
Mark Whitley6315ce62000-07-10 22:55:51 +0000274 idx = index_of_next_unescaped_slash(idx, cmdstr);
Mark Whitleydf5f6ba2000-07-11 16:53:56 +0000275 if (idx == -1)
276 exit_sed(1, "sed: bad format in substitution expression\n");
277 match = strdup_substr(cmdstr, oldidx, idx);
Mark Whitley6315ce62000-07-10 22:55:51 +0000278
Mark Whitleydf5f6ba2000-07-11 16:53:56 +0000279 /* save the replacement string */
280 oldidx = idx+1;
281 idx = index_of_next_unescaped_slash(idx, cmdstr);
282 if (idx == -1)
283 exit_sed(1, "sed: bad format in substitution expression\n");
284 sed_cmd->replace = strdup_substr(cmdstr, oldidx, idx);
285
286 /* process the flags */
287 while (cmdstr[++idx]) {
288 switch (cmdstr[idx]) {
289 case 'g':
290 sed_cmd->sub_g = 1;
291 break;
292 case 'I':
293 cflags |= REG_ICASE;
294 break;
295 default:
296 exit_sed(1, "sed: bad option in substitution expression\n");
297 }
298 }
299
300 /* compile the regex */
301 sed_cmd->sub_match = (regex_t *)xmalloc(sizeof(regex_t));
302 if (bb_regcomp(sed_cmd->sub_match, match, cflags) != 0) {
303 free(match);
304 exit_sed(1, NULL);
305 }
306 free(match);
Erik Andersen1266a131999-12-29 22:19:46 +0000307 }
Eric Andersen50d63601999-11-09 01:47:36 +0000308}
Eric Andersen6b6b3f61999-10-28 16:06:25 +0000309
Mark Whitley6315ce62000-07-10 22:55:51 +0000310static void add_cmd_str(const char *cmdstr)
Erik Andersen1266a131999-12-29 22:19:46 +0000311{
Mark Whitley6315ce62000-07-10 22:55:51 +0000312 char *my_cmdstr = trim_str(cmdstr);
Erik Andersen1266a131999-12-29 22:19:46 +0000313
Mark Whitley6315ce62000-07-10 22:55:51 +0000314 /* if this is a comment, don't even bother */
315 if (my_cmdstr[0] == '#') {
316 free(my_cmdstr);
317 return;
318 }
319
320 /* grow the array */
321 sed_cmds = realloc(sed_cmds, sizeof(struct sed_cmd) * (++ncmds));
322 /* zero new element */
323 memset(&sed_cmds[ncmds-1], 0, sizeof(struct sed_cmd));
324 /* load command string into new array element */
325 parse_cmd_str(&sed_cmds[ncmds-1], my_cmdstr);
326}
327
328
/*
 * load_cmd_file - reads a script file and adds each of its lines as a command
 *
 * filename: path of the script file
 *
 * Ends the program through exit_sed() if the file cannot be opened.
 */
static void load_cmd_file(char *filename)
{
	FILE *cmdfile;
	char *line;

	cmdfile = fopen(filename, "r");
	if (cmdfile == NULL) {
		fprintf(stderr, "sed: %s: %s\n", filename, strerror(errno));
		exit_sed(1, NULL);
	}

	while ((line = get_line_from_file(cmdfile)) != NULL) {
		size_t len = strlen(line);

		/* eat the newline, but only if there is one: a final line
		 * without a newline must keep its last character */
		if (len > 0 && line[len - 1] == '\n')
			line[len - 1] = '\0';
		add_cmd_str(line);
		free(line);
	}

	fclose(cmdfile);
}
344
345
346static int do_sed_command(const struct sed_cmd *sed_cmd, const char *line)
347{
348 int altered = 0;
349
350 switch (sed_cmd->cmd) {
351
352 case 'p':
353 fputs(line, stdout);
354 break;
355
356 case 'd':
357 altered++;
358 break;
359
360 case 's': /* oo, a fun one :-) */
361
362 /* we only substitute if the substitution 'search' expression matches */
363 if (regexec(sed_cmd->sub_match, line, 0, NULL, 0) == 0) {
364 regmatch_t regmatch;
365 int i;
366 char *ptr = (char *)line;
367
368 while (*ptr) {
369 /* if we can match the search string... */
370 if (regexec(sed_cmd->sub_match, ptr, 1, &regmatch, 0) == 0) {
371 /* print everything before the match, */
372 for (i = 0; i < regmatch.rm_so; i++)
373 fputc(ptr[i], stdout);
374 /* then print the substitution in its place */
375 fputs(sed_cmd->replace, stdout);
376 /* then advance past the match */
377 ptr += regmatch.rm_eo;
378 /* and let the calling function know that something
379 * has been changed */
380 altered++;
381
382 /* if we're not doing this globally... */
383 if (!sed_cmd->sub_g)
384 break;
385 }
386 /* if we COULD NOT match the search string (meaning we've gone past
387 * all previous instances), get out */
388 else
389 break;
390 }
391
392 /* is there anything left to print? */
393 if (*ptr)
394 fputs(ptr, stdout);
395 }
396
397 break;
398 }
399
400 return altered;
401}
402
403static void process_file(FILE *file)
404{
405 char *line = NULL;
406 static int linenum = 0; /* GNU sed does not restart counting lines at EOF */
407 unsigned int still_in_range = 0;
408 int line_altered;
409 int i;
410
411 /* go through every line in the file */
412 while ((line = get_line_from_file(file)) != NULL) {
413
414 linenum++;
415 line_altered = 0;
416
417 /* for every line, go through all the commands */
418 for (i = 0; i < ncmds; i++) {
419
420 /* are we acting on a range of matched lines? */
421 if (sed_cmds[i].beg_match && sed_cmds[i].end_match) {
422 if (still_in_range || regexec(sed_cmds[i].beg_match, line, 0, NULL, 0) == 0) {
423 line_altered += do_sed_command(&sed_cmds[i], line);
424 still_in_range = 1;
425 if (regexec(sed_cmds[i].end_match, line, 0, NULL, 0) == 0)
426 still_in_range = 0;
427 }
428 }
429
430 /* are we trying to match a single line? */
431 else if (sed_cmds[i].beg_match) {
432 if (regexec(sed_cmds[i].beg_match, line, 0, NULL, 0) == 0)
433 line_altered += do_sed_command(&sed_cmds[i], line);
434 }
435
436 /* are we acting on a range of line numbers? */
437 else if (sed_cmds[i].beg_line > 0 && sed_cmds[i].end_line > 0) {
438 if (linenum >= sed_cmds[i].beg_line && linenum <= sed_cmds[i].end_line)
439 line_altered += do_sed_command(&sed_cmds[i], line);
440 }
441
442 /* are we acting on a specified line number */
443 else if (sed_cmds[i].beg_line > 0) {
444 if (linenum == sed_cmds[i].beg_line)
445 line_altered += do_sed_command(&sed_cmds[i], line);
446 }
447
448 /* not acting on matches or line numbers. act on every line */
449 else
450 line_altered += do_sed_command(&sed_cmds[i], line);
451
Erik Andersene49d5ec2000-02-08 19:58:47 +0000452 }
Erik Andersen1266a131999-12-29 22:19:46 +0000453
Mark Whitley6315ce62000-07-10 22:55:51 +0000454 /* we will print the line unless we were told to be quiet or if the
455 * line was altered (via a 'd'elete or 's'ubstitution) */
456 if (!be_quiet && !line_altered)
457 fputs(line, stdout);
458
459 free(line);
Erik Andersene49d5ec2000-02-08 19:58:47 +0000460 }
Erik Andersen1266a131999-12-29 22:19:46 +0000461}
462
463extern int sed_main(int argc, char **argv)
Eric Andersen6b6b3f61999-10-28 16:06:25 +0000464{
Mark Whitley6315ce62000-07-10 22:55:51 +0000465 int opt;
Eric Andersen6b6b3f61999-10-28 16:06:25 +0000466
Mark Whitley6315ce62000-07-10 22:55:51 +0000467 /* do special-case option parsing */
468 if (argv[1] && (strcmp(argv[1], "--help") == 0))
Eric Andersenc1525e81999-10-29 00:07:31 +0000469 usage(sed_usage);
Eric Andersen6b6b3f61999-10-28 16:06:25 +0000470
Mark Whitley6315ce62000-07-10 22:55:51 +0000471 /* do normal option parsing */
472 while ((opt = getopt(argc, argv, "Vhne:f:")) > 0) {
473 switch (opt) {
474 case 'V':
475 printf("Print Busybox version here\n");
476 exit(0);
477 break;
478 case 'h':
479 usage(sed_usage);
480 break;
Erik Andersene916d242000-03-06 19:20:35 +0000481 case 'n':
Mark Whitley6315ce62000-07-10 22:55:51 +0000482 be_quiet++;
Erik Andersene916d242000-03-06 19:20:35 +0000483 break;
484 case 'e':
Mark Whitley6315ce62000-07-10 22:55:51 +0000485 add_cmd_str(optarg);
Erik Andersene916d242000-03-06 19:20:35 +0000486 break;
Mark Whitley6315ce62000-07-10 22:55:51 +0000487 case 'f':
488 load_cmd_file(optarg);
Erik Andersene49d5ec2000-02-08 19:58:47 +0000489 break;
Erik Andersene49d5ec2000-02-08 19:58:47 +0000490 }
Eric Andersen6b6b3f61999-10-28 16:06:25 +0000491 }
Mark Whitley6315ce62000-07-10 22:55:51 +0000492
493 /* if we didn't get a pattern from a -e and no command file was specified,
494 * argv[optind] should be the pattern. no pattern, no worky */
495 if (ncmds == 0) {
496 if (argv[optind] == NULL)
497 usage(sed_usage);
498 else {
499 add_cmd_str(argv[optind]);
500 optind++;
501 }
502 }
503
504
505 /* argv[(optind)..(argc-1)] should be names of file to process. If no
506 * files were specified or '-' was specified, take input from stdin.
507 * Otherwise, we process all the files specified. */
508 if (argv[optind] == NULL || (strcmp(argv[optind], "-") == 0)) {
509 process_file(stdin);
510 }
511 else {
512 int i;
513 FILE *file;
514 for (i = optind; i < argc; i++) {
515 file = fopen(argv[i], "r");
516 if (file == NULL) {
517 fprintf(stderr, "sed: %s: %s\n", argv[i], strerror(errno));
518 } else {
519 process_file(file);
520 fclose(file);
521 }
522 }
523 }
524
525 exit_sed(0, NULL);
526
527 /* not reached */
528 return 0;
Eric Andersen6b6b3f61999-10-28 16:06:25 +0000529}