Erik Andersen | e49d5ec | 2000-02-08 19:58:47 +0000 | [diff] [blame] | 1 | /* vi: set sw=4 ts=4: */ |
Erik Andersen | 3163821 | 2000-01-15 22:28:50 +0000 | [diff] [blame] | 2 | /* |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 3 | * wc implementation for busybox |
Erik Andersen | 3163821 | 2000-01-15 22:28:50 +0000 | [diff] [blame] | 4 | * |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 5 | * Copyright (C) 2003 Manuel Novoa III <mjn3@codepoet.org> |
Erik Andersen | 3163821 | 2000-01-15 22:28:50 +0000 | [diff] [blame] | 6 | * |
"Robert P. J. Day" | 801ab14 | 2006-07-12 07:56:04 +0000 | [diff] [blame] | 7 | * Licensed under GPLv2 or later, see file LICENSE in this tarball for details. |
Erik Andersen | 3163821 | 2000-01-15 22:28:50 +0000 | [diff] [blame] | 8 | */ |
| 9 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 10 | /* BB_AUDIT SUSv3 _NOT_ compliant -- option -m is not currently supported. */ |
| 11 | /* http://www.opengroup.org/onlinepubs/007904975/utilities/wc.html */ |
| 12 | |
| 13 | /* Mar 16, 2003 Manuel Novoa III (mjn3@codepoet.org) |
| 14 | * |
| 15 | * Rewritten to fix a number of problems and do some size optimizations. |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 16 | * Problems in the previous busybox implementation (besides bloat) included: |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 17 | * 1) broken 'wc -c' optimization (read note below) |
| 18 | * 2) broken handling of '-' args |
| 19 | * 3) no checking of ferror on EOF returns |
| 20 | * 4) isprint() wasn't considered when word counting. |
| 21 | * |
| 22 | * TODO: |
| 23 | * |
| 24 | * When locale support is enabled, count multibyte chars in the '-m' case. |
| 25 | * |
| 26 | * NOTES: |
| 27 | * |
| 28 | * The previous busybox wc attempted an optimization using stat for the |
| 29 | * case of counting chars only. I omitted that because it was broken. |
| 30 | * It didn't take into account the possibility of input coming from a |
| 31 | * pipe, or input from a file with file pointer not at the beginning. |
| 32 | * |
| 33 | * To implement such a speed optimization correctly, not only do you |
| 34 | * need the size, but also the file position. Note also that the |
| 35 | * file position may be past the end of file. Consider the example |
| 36 | * (adapted from example in gnu wc.c) |
| 37 | * |
| 38 | * echo hello > /tmp/testfile && |
Denis Vlasenko | b71c668 | 2007-07-21 15:08:09 +0000 | [diff] [blame] | 39 | * (dd ibs=1k skip=1 count=0 &> /dev/null; wc -c) < /tmp/testfile |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 40 | * |
| 41 | * for which 'wc -c' should output '0'. |
| 42 | */ |
| 43 | |
Denis Vlasenko | b6adbf1 | 2007-05-26 19:00:18 +0000 | [diff] [blame] | 44 | #include "libbb.h" |
Erik Andersen | 3163821 | 2000-01-15 22:28:50 +0000 | [diff] [blame] | 45 | |
/*
 * Character classification used when locale support is compiled out.
 *
 * In that configuration any isprint()/isspace() definitions picked up from
 * the included headers are dropped and replaced by fixed ASCII-only versions:
 *   isprint(c): true for 0x20..0x7e (a single unsigned range check)
 *   isspace(c): true only for ' '
 * The control characters \t, \n, \v, \f and \r are not covered by this
 * isspace(); wc_main tests for them separately with its own range check.
 *
 * NOTE(review): no use of isprint() is visible in this file (wc_main calls
 * isprint_asciionly() instead), so that override may be vestigial.
 */
#if ENABLE_LOCALE_SUPPORT
/* Locale-aware build: keep the isprint()/isspace() the headers provide. */
#else
# undef isprint
# undef isspace
# define isprint(c) ((unsigned)((c) - 0x20) <= (0x7e - 0x20))
# define isspace(c) ((c) == ' ')
#endif
| 52 | |
/*
 * Counter type and the matching printf conversion (without the leading '%').
 *
 * The default is a plain unsigned int; builds with ENABLE_FEATURE_WC_LARGE
 * switch every counter to unsigned long long.  COUNT_FMT must always be kept
 * in step with COUNT_T because it is pasted into printf format strings.
 */
#if !ENABLE_FEATURE_WC_LARGE
# define COUNT_T unsigned
# define COUNT_FMT "u"
#else
# define COUNT_T unsigned long long
# define COUNT_FMT "llu"
#endif
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 60 | |
/*
 * Slots of the counts[] / totals[] arrays in wc_main.
 *
 * The same numbers double as bit positions in the option mask: wc_main
 * tests (1 << WC_xxx) against the value returned for the "lwcL" option
 * string, so the order here has to match the order of those letters.
 */
enum {
	WC_LINES = 0,	/* -l: newline count */
	WC_WORDS,	/* -w: word count */
	WC_CHARS,	/* -c: byte count */
	WC_LENGTH	/* -L: longest line */
};
Erik Andersen | 3163821 | 2000-01-15 22:28:50 +0000 | [diff] [blame] | 67 | |
Denis Vlasenko | 9b49a5e | 2007-10-11 10:05:36 +0000 | [diff] [blame] | 68 | int wc_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; |
Denis Vlasenko | a60f84e | 2008-07-05 09:18:54 +0000 | [diff] [blame] | 69 | int wc_main(int argc UNUSED_PARAM, char **argv) |
Erik Andersen | e49d5ec | 2000-02-08 19:58:47 +0000 | [diff] [blame] | 70 | { |
Dan Fandrich | 5b0a7f1 | 2009-11-18 10:48:09 +0100 | [diff] [blame] | 71 | const char *arg; |
Denis Vlasenko | e8419c9 | 2008-02-19 00:38:10 +0000 | [diff] [blame] | 72 | const char *start_fmt = " %9"COUNT_FMT + 1; |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 73 | const char *fname_fmt = " %s\n"; |
| 74 | COUNT_T *pcounts; |
| 75 | COUNT_T counts[4]; |
| 76 | COUNT_T totals[4]; |
Dan Fandrich | 5b0a7f1 | 2009-11-18 10:48:09 +0100 | [diff] [blame] | 77 | int num_files; |
Bernhard Reutner-Fischer | d58c194 | 2007-01-20 21:28:36 +0000 | [diff] [blame] | 78 | smallint status = EXIT_SUCCESS; |
Denis Vlasenko | 7021016 | 2006-09-29 23:41:59 +0000 | [diff] [blame] | 79 | unsigned print_type; |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 80 | |
Denis Vlasenko | fe7cd64 | 2007-08-18 15:32:12 +0000 | [diff] [blame] | 81 | print_type = getopt32(argv, "lwcL"); |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 82 | |
Glenn L McGrath | 02d090d | 2001-11-21 09:17:00 +0000 | [diff] [blame] | 83 | if (print_type == 0) { |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 84 | print_type = (1 << WC_LINES) | (1 << WC_WORDS) | (1 << WC_CHARS); |
Glenn L McGrath | 02d090d | 2001-11-21 09:17:00 +0000 | [diff] [blame] | 85 | } |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 86 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 87 | argv += optind; |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 88 | if (!argv[0]) { |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 89 | *--argv = (char *) bb_msg_standard_input; |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 90 | fname_fmt = "\n"; |
Denys Vlasenko | 79950a6 | 2010-03-08 22:03:24 +0100 | [diff] [blame] | 91 | } |
| 92 | if (!argv[1]) { /* zero or one filename? */ |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 93 | if (!((print_type-1) & print_type)) /* exactly one option? */ |
| 94 | start_fmt = "%"COUNT_FMT; |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 95 | } |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 96 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 97 | memset(totals, 0, sizeof(totals)); |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 98 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 99 | pcounts = counts; |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 100 | |
Dan Fandrich | 5b0a7f1 | 2009-11-18 10:48:09 +0100 | [diff] [blame] | 101 | num_files = 0; |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 102 | while ((arg = *argv++) != 0) { |
Dan Fandrich | 5b0a7f1 | 2009-11-18 10:48:09 +0100 | [diff] [blame] | 103 | FILE *fp; |
| 104 | const char *s; |
| 105 | unsigned u; |
| 106 | unsigned linepos; |
| 107 | smallint in_word; |
| 108 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 109 | ++num_files; |
Denis Vlasenko | ddec5af | 2006-10-26 23:25:17 +0000 | [diff] [blame] | 110 | fp = fopen_or_warn_stdin(arg); |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 111 | if (!fp) { |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 112 | status = EXIT_FAILURE; |
| 113 | continue; |
| 114 | } |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 115 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 116 | memset(counts, 0, sizeof(counts)); |
| 117 | linepos = 0; |
| 118 | in_word = 0; |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 119 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 120 | do { |
Dan Fandrich | 5b0a7f1 | 2009-11-18 10:48:09 +0100 | [diff] [blame] | 121 | int c; |
Denis Vlasenko | 7021016 | 2006-09-29 23:41:59 +0000 | [diff] [blame] | 122 | /* Our -w doesn't match GNU wc exactly... oh well */ |
| 123 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 124 | ++counts[WC_CHARS]; |
| 125 | c = getc(fp); |
Dan Fandrich | 5b0a7f1 | 2009-11-18 10:48:09 +0100 | [diff] [blame] | 126 | if (c == EOF) { |
| 127 | if (ferror(fp)) { |
| 128 | bb_simple_perror_msg(arg); |
| 129 | status = EXIT_FAILURE; |
| 130 | } |
| 131 | --counts[WC_CHARS]; |
| 132 | goto DO_EOF; /* Treat an EOF as '\r'. */ |
| 133 | } |
Denys Vlasenko | c270454 | 2009-11-20 19:14:19 +0100 | [diff] [blame] | 134 | if (isprint_asciionly(c)) { |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 135 | ++linepos; |
Denys Vlasenko | c0dab37 | 2009-10-22 22:28:08 +0200 | [diff] [blame] | 136 | if (!isspace(c)) { |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 137 | in_word = 1; |
| 138 | continue; |
| 139 | } |
Denys Vlasenko | c0dab37 | 2009-10-22 22:28:08 +0200 | [diff] [blame] | 140 | } else if ((unsigned)(c - 9) <= 4) { |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 141 | /* \t 9 |
| 142 | * \n 10 |
| 143 | * \v 11 |
| 144 | * \f 12 |
| 145 | * \r 13 |
| 146 | */ |
| 147 | if (c == '\t') { |
| 148 | linepos = (linepos | 7) + 1; |
| 149 | } else { /* '\n', '\r', '\f', or '\v' */ |
Dan Fandrich | 5b0a7f1 | 2009-11-18 10:48:09 +0100 | [diff] [blame] | 150 | DO_EOF: |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 151 | if (linepos > counts[WC_LENGTH]) { |
| 152 | counts[WC_LENGTH] = linepos; |
| 153 | } |
| 154 | if (c == '\n') { |
| 155 | ++counts[WC_LINES]; |
| 156 | } |
| 157 | if (c != '\v') { |
| 158 | linepos = 0; |
| 159 | } |
| 160 | } |
Glenn L McGrath | 74afa9a | 2001-11-21 10:26:28 +0000 | [diff] [blame] | 161 | } else { |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 162 | continue; |
Glenn L McGrath | 74afa9a | 2001-11-21 10:26:28 +0000 | [diff] [blame] | 163 | } |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 164 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 165 | counts[WC_WORDS] += in_word; |
| 166 | in_word = 0; |
| 167 | if (c == EOF) { |
| 168 | break; |
| 169 | } |
| 170 | } while (1); |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 171 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 172 | if (totals[WC_LENGTH] < counts[WC_LENGTH]) { |
| 173 | totals[WC_LENGTH] = counts[WC_LENGTH]; |
Glenn L McGrath | 9e6c9f7 | 2001-11-21 12:46:36 +0000 | [diff] [blame] | 174 | } |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 175 | totals[WC_LENGTH] -= counts[WC_LENGTH]; |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 176 | |
Denis Vlasenko | ddec5af | 2006-10-26 23:25:17 +0000 | [diff] [blame] | 177 | fclose_if_not_stdin(fp); |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 178 | |
Dan Fandrich | 5b0a7f1 | 2009-11-18 10:48:09 +0100 | [diff] [blame] | 179 | OUTPUT: |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 180 | /* coreutils wc tries hard to print pretty columns |
| 181 | * (saves results for all files, find max col len etc...) |
| 182 | * we won't try that hard, it will bloat us too much */ |
| 183 | s = start_fmt; |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 184 | u = 0; |
| 185 | do { |
| 186 | if (print_type & (1 << u)) { |
Denis Vlasenko | f0ed376 | 2006-10-26 23:21:47 +0000 | [diff] [blame] | 187 | printf(s, pcounts[u]); |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 188 | s = " %9"COUNT_FMT; /* Ok... restore the leading space. */ |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 189 | } |
| 190 | totals[u] += pcounts[u]; |
| 191 | } while (++u < 4); |
Denis Vlasenko | f0ed376 | 2006-10-26 23:21:47 +0000 | [diff] [blame] | 192 | printf(fname_fmt, arg); |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 193 | } |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 194 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 195 | /* If more than one file was processed, we want the totals. To save some |
| 196 | * space, we set the pcounts ptr to the totals array. This has the side |
| 197 | * effect of trashing the totals array after outputting it, but that's |
| 198 | * irrelavent since we no longer need it. */ |
| 199 | if (num_files > 1) { |
| 200 | num_files = 0; /* Make sure we don't get here again. */ |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 201 | arg = "total"; |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 202 | pcounts = totals; |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 203 | --argv; |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 204 | goto OUTPUT; |
Mark Whitley | 3950596 | 2000-07-20 00:08:10 +0000 | [diff] [blame] | 205 | } |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 206 | |
Denis Vlasenko | f0ed376 | 2006-10-26 23:21:47 +0000 | [diff] [blame] | 207 | fflush_stdout_and_exit(status); |
Erik Andersen | 3163821 | 2000-01-15 22:28:50 +0000 | [diff] [blame] | 208 | } |