Erik Andersen | e49d5ec | 2000-02-08 19:58:47 +0000 | [diff] [blame] | 1 | /* vi: set sw=4 ts=4: */ |
Erik Andersen | 3163821 | 2000-01-15 22:28:50 +0000 | [diff] [blame] | 2 | /* |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 3 | * wc implementation for busybox |
Erik Andersen | 3163821 | 2000-01-15 22:28:50 +0000 | [diff] [blame] | 4 | * |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 5 | * Copyright (C) 2003 Manuel Novoa III <mjn3@codepoet.org> |
Erik Andersen | 3163821 | 2000-01-15 22:28:50 +0000 | [diff] [blame] | 6 | * |
Denys Vlasenko | 0ef64bd | 2010-08-16 20:14:46 +0200 | [diff] [blame] | 7 | * Licensed under GPLv2 or later, see file LICENSE in this source tree. |
Erik Andersen | 3163821 | 2000-01-15 22:28:50 +0000 | [diff] [blame] | 8 | */ |
| 9 | |
Denys Vlasenko | afc7b4c | 2010-10-04 17:08:14 +0200 | [diff] [blame] | 10 | /* BB_AUDIT SUSv3 compliant. */ |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 11 | /* http://www.opengroup.org/onlinepubs/007904975/utilities/wc.html */ |
| 12 | |
| 13 | /* Mar 16, 2003 Manuel Novoa III (mjn3@codepoet.org) |
| 14 | * |
| 15 | * Rewritten to fix a number of problems and do some size optimizations. |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 16 | * Problems in the previous busybox implementation (besides bloat) included: |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 17 | * 1) broken 'wc -c' optimization (read note below) |
| 18 | * 2) broken handling of '-' args |
| 19 | * 3) no checking of ferror on EOF returns |
| 20 | * 4) isprint() wasn't considered when word counting. |
| 21 | * |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 22 | * NOTES: |
| 23 | * |
| 24 | * The previous busybox wc attempted an optimization using stat for the |
| 25 | * case of counting chars only. I omitted that because it was broken. |
| 26 | * It didn't take into account the possibility of input coming from a |
| 27 | * pipe, or input from a file with file pointer not at the beginning. |
| 28 | * |
| 29 | * To implement such a speed optimization correctly, not only do you |
| 30 | * need the size, but also the file position. Note also that the |
| 31 | * file position may be past the end of file. Consider the example |
| 32 | * (adapted from example in gnu wc.c) |
| 33 | * |
| 34 | * echo hello > /tmp/testfile && |
Denis Vlasenko | b71c668 | 2007-07-21 15:08:09 +0000 | [diff] [blame] | 35 | * (dd ibs=1k skip=1 count=0 &> /dev/null; wc -c) < /tmp/testfile |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 36 | * |
| 37 | * for which 'wc -c' should output '0'. |
| 38 | */ |
Denis Vlasenko | b6adbf1 | 2007-05-26 19:00:18 +0000 | [diff] [blame] | 39 | #include "libbb.h" |
Denys Vlasenko | afc7b4c | 2010-10-04 17:08:14 +0200 | [diff] [blame] | 40 | #include "unicode.h" |
Erik Andersen | 3163821 | 2000-01-15 22:28:50 +0000 | [diff] [blame] | 41 | |
/* Without locale support, override any existing isprint/isspace macros
 * with fixed ASCII-only tests:
 *   isprint(c): c lies in 0x20..0x7e (one unsigned compare covers both ends)
 *   isspace(c): c is the space character only
 */
#if !ENABLE_LOCALE_SUPPORT
# undef isspace
# undef isprint
# define isspace(c) (' ' == (c))
# define isprint(c) ((unsigned)((c) - 0x20) < 0x5fu)
#endif
| 48 | |
/* COUNT_T is the type of every counter; COUNT_FMT is the printf
 * conversion (without the leading '%') that prints a COUNT_T.
 * FEATURE_WC_LARGE widens the counters to unsigned long long.
 */
#if !ENABLE_FEATURE_WC_LARGE
# define COUNT_T unsigned
# define COUNT_FMT "u"
#else
# define COUNT_T unsigned long long
# define COUNT_FMT "llu"
#endif
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 56 | |
Denys Vlasenko | afc7b4c | 2010-10-04 17:08:14 +0200 | [diff] [blame] | 57 | /* We support -m even when UNICODE_SUPPORT is off, |
| 58 | * we just don't advertise it in help text, |
| 59 | * since it is the same as -c in this case. |
| 60 | */ |
| 61 | |
| 62 | //usage:#define wc_trivial_usage |
| 63 | //usage: "[-c"IF_UNICODE_SUPPORT("m")"lwL] [FILE]..." |
| 64 | //usage: |
| 65 | //usage:#define wc_full_usage "\n\n" |
| 66 | //usage: "Count lines, words, and bytes for each FILE (or stdin)\n" |
| 67 | //usage: "\nOptions:" |
| 68 | //usage: "\n -c Count bytes" |
| 69 | //usage: IF_UNICODE_SUPPORT( |
| 70 | //usage: "\n -m Count characters" |
| 71 | //usage: ) |
| 72 | //usage: "\n -l Count newlines" |
| 73 | //usage: "\n -w Count words" |
| 74 | //usage: "\n -L Print longest line length" |
| 75 | //usage: |
| 76 | //usage:#define wc_example_usage |
| 77 | //usage: "$ wc /etc/passwd\n" |
| 78 | //usage: " 31 46 1365 /etc/passwd\n" |
| 79 | |
| 80 | /* Order is important if we want to be compatible with |
| 81 | * column order in "wc -cmlwL" output: |
| 82 | */ |
enum {
	WC_LINES,    /* 0: number of '\n' bytes */
	WC_WORDS,    /* 1: number of words */
	WC_UNICHARS, /* 2: bytes that are not UTF-8 continuation bytes when Unicode is on, else every byte */
	WC_CHARS,    /* 3: every byte read */
	WC_LENGTH,   /* 4: longest line length */
	NUM_WCS,     /* 5: number of counters (array size) */
};
Erik Andersen | 3163821 | 2000-01-15 22:28:50 +0000 | [diff] [blame] | 91 | |
Denis Vlasenko | 9b49a5e | 2007-10-11 10:05:36 +0000 | [diff] [blame] | 92 | int wc_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; |
Denis Vlasenko | a60f84e | 2008-07-05 09:18:54 +0000 | [diff] [blame] | 93 | int wc_main(int argc UNUSED_PARAM, char **argv) |
Erik Andersen | e49d5ec | 2000-02-08 19:58:47 +0000 | [diff] [blame] | 94 | { |
Dan Fandrich | 5b0a7f1 | 2009-11-18 10:48:09 +0100 | [diff] [blame] | 95 | const char *arg; |
Denis Vlasenko | e8419c9 | 2008-02-19 00:38:10 +0000 | [diff] [blame] | 96 | const char *start_fmt = " %9"COUNT_FMT + 1; |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 97 | const char *fname_fmt = " %s\n"; |
| 98 | COUNT_T *pcounts; |
Denys Vlasenko | 09e7daf | 2010-10-04 17:04:20 +0200 | [diff] [blame] | 99 | COUNT_T counts[NUM_WCS]; |
| 100 | COUNT_T totals[NUM_WCS]; |
Dan Fandrich | 5b0a7f1 | 2009-11-18 10:48:09 +0100 | [diff] [blame] | 101 | int num_files; |
Bernhard Reutner-Fischer | d58c194 | 2007-01-20 21:28:36 +0000 | [diff] [blame] | 102 | smallint status = EXIT_SUCCESS; |
Denis Vlasenko | 7021016 | 2006-09-29 23:41:59 +0000 | [diff] [blame] | 103 | unsigned print_type; |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 104 | |
Denys Vlasenko | afc7b4c | 2010-10-04 17:08:14 +0200 | [diff] [blame] | 105 | init_unicode(); |
| 106 | |
| 107 | print_type = getopt32(argv, "lwcmL"); |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 108 | |
Glenn L McGrath | 02d090d | 2001-11-21 09:17:00 +0000 | [diff] [blame] | 109 | if (print_type == 0) { |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 110 | print_type = (1 << WC_LINES) | (1 << WC_WORDS) | (1 << WC_CHARS); |
Glenn L McGrath | 02d090d | 2001-11-21 09:17:00 +0000 | [diff] [blame] | 111 | } |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 112 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 113 | argv += optind; |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 114 | if (!argv[0]) { |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 115 | *--argv = (char *) bb_msg_standard_input; |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 116 | fname_fmt = "\n"; |
Denys Vlasenko | 79950a6 | 2010-03-08 22:03:24 +0100 | [diff] [blame] | 117 | } |
| 118 | if (!argv[1]) { /* zero or one filename? */ |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 119 | if (!((print_type-1) & print_type)) /* exactly one option? */ |
| 120 | start_fmt = "%"COUNT_FMT; |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 121 | } |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 122 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 123 | memset(totals, 0, sizeof(totals)); |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 124 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 125 | pcounts = counts; |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 126 | |
Dan Fandrich | 5b0a7f1 | 2009-11-18 10:48:09 +0100 | [diff] [blame] | 127 | num_files = 0; |
Denys Vlasenko | 09e7daf | 2010-10-04 17:04:20 +0200 | [diff] [blame] | 128 | while ((arg = *argv++) != NULL) { |
Dan Fandrich | 5b0a7f1 | 2009-11-18 10:48:09 +0100 | [diff] [blame] | 129 | FILE *fp; |
| 130 | const char *s; |
| 131 | unsigned u; |
| 132 | unsigned linepos; |
| 133 | smallint in_word; |
| 134 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 135 | ++num_files; |
Denis Vlasenko | ddec5af | 2006-10-26 23:25:17 +0000 | [diff] [blame] | 136 | fp = fopen_or_warn_stdin(arg); |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 137 | if (!fp) { |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 138 | status = EXIT_FAILURE; |
| 139 | continue; |
| 140 | } |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 141 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 142 | memset(counts, 0, sizeof(counts)); |
| 143 | linepos = 0; |
| 144 | in_word = 0; |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 145 | |
Denys Vlasenko | 09e7daf | 2010-10-04 17:04:20 +0200 | [diff] [blame] | 146 | while (1) { |
Dan Fandrich | 5b0a7f1 | 2009-11-18 10:48:09 +0100 | [diff] [blame] | 147 | int c; |
Denis Vlasenko | 7021016 | 2006-09-29 23:41:59 +0000 | [diff] [blame] | 148 | /* Our -w doesn't match GNU wc exactly... oh well */ |
| 149 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 150 | c = getc(fp); |
Dan Fandrich | 5b0a7f1 | 2009-11-18 10:48:09 +0100 | [diff] [blame] | 151 | if (c == EOF) { |
| 152 | if (ferror(fp)) { |
| 153 | bb_simple_perror_msg(arg); |
| 154 | status = EXIT_FAILURE; |
| 155 | } |
Denys Vlasenko | fb132e4 | 2010-10-29 11:46:52 +0200 | [diff] [blame] | 156 | goto DO_EOF; /* Treat an EOF as '\r'. */ |
Dan Fandrich | 5b0a7f1 | 2009-11-18 10:48:09 +0100 | [diff] [blame] | 157 | } |
Denys Vlasenko | 09e7daf | 2010-10-04 17:04:20 +0200 | [diff] [blame] | 158 | |
Denys Vlasenko | afc7b4c | 2010-10-04 17:08:14 +0200 | [diff] [blame] | 159 | /* Cater for -c and -m */ |
| 160 | ++counts[WC_CHARS]; |
| 161 | if (unicode_status != UNICODE_ON /* every byte is a new char */ |
| 162 | || (c & 0xc0) != 0x80 /* it isn't a 2nd+ byte of a Unicode char */ |
| 163 | ) { |
| 164 | ++counts[WC_UNICHARS]; |
| 165 | } |
| 166 | |
| 167 | if (isprint_asciionly(c)) { /* FIXME: not unicode-aware */ |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 168 | ++linepos; |
Denys Vlasenko | c0dab37 | 2009-10-22 22:28:08 +0200 | [diff] [blame] | 169 | if (!isspace(c)) { |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 170 | in_word = 1; |
| 171 | continue; |
| 172 | } |
Denys Vlasenko | c0dab37 | 2009-10-22 22:28:08 +0200 | [diff] [blame] | 173 | } else if ((unsigned)(c - 9) <= 4) { |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 174 | /* \t 9 |
| 175 | * \n 10 |
| 176 | * \v 11 |
| 177 | * \f 12 |
| 178 | * \r 13 |
| 179 | */ |
| 180 | if (c == '\t') { |
| 181 | linepos = (linepos | 7) + 1; |
Denys Vlasenko | fb132e4 | 2010-10-29 11:46:52 +0200 | [diff] [blame] | 182 | } else { /* '\n', '\r', '\f', or '\v' */ |
Dan Fandrich | 5b0a7f1 | 2009-11-18 10:48:09 +0100 | [diff] [blame] | 183 | DO_EOF: |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 184 | if (linepos > counts[WC_LENGTH]) { |
| 185 | counts[WC_LENGTH] = linepos; |
| 186 | } |
| 187 | if (c == '\n') { |
| 188 | ++counts[WC_LINES]; |
| 189 | } |
| 190 | if (c != '\v') { |
| 191 | linepos = 0; |
| 192 | } |
| 193 | } |
Glenn L McGrath | 74afa9a | 2001-11-21 10:26:28 +0000 | [diff] [blame] | 194 | } else { |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 195 | continue; |
Glenn L McGrath | 74afa9a | 2001-11-21 10:26:28 +0000 | [diff] [blame] | 196 | } |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 197 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 198 | counts[WC_WORDS] += in_word; |
| 199 | in_word = 0; |
| 200 | if (c == EOF) { |
| 201 | break; |
| 202 | } |
Denys Vlasenko | 09e7daf | 2010-10-04 17:04:20 +0200 | [diff] [blame] | 203 | } |
| 204 | |
| 205 | fclose_if_not_stdin(fp); |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 206 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 207 | if (totals[WC_LENGTH] < counts[WC_LENGTH]) { |
| 208 | totals[WC_LENGTH] = counts[WC_LENGTH]; |
Glenn L McGrath | 9e6c9f7 | 2001-11-21 12:46:36 +0000 | [diff] [blame] | 209 | } |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 210 | totals[WC_LENGTH] -= counts[WC_LENGTH]; |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 211 | |
Dan Fandrich | 5b0a7f1 | 2009-11-18 10:48:09 +0100 | [diff] [blame] | 212 | OUTPUT: |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 213 | /* coreutils wc tries hard to print pretty columns |
Denys Vlasenko | 09e7daf | 2010-10-04 17:04:20 +0200 | [diff] [blame] | 214 | * (saves results for all files, finds max col len etc...) |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 215 | * we won't try that hard, it will bloat us too much */ |
| 216 | s = start_fmt; |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 217 | u = 0; |
| 218 | do { |
| 219 | if (print_type & (1 << u)) { |
Denis Vlasenko | f0ed376 | 2006-10-26 23:21:47 +0000 | [diff] [blame] | 220 | printf(s, pcounts[u]); |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 221 | s = " %9"COUNT_FMT; /* Ok... restore the leading space. */ |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 222 | } |
| 223 | totals[u] += pcounts[u]; |
Denys Vlasenko | 09e7daf | 2010-10-04 17:04:20 +0200 | [diff] [blame] | 224 | } while (++u < NUM_WCS); |
Denis Vlasenko | f0ed376 | 2006-10-26 23:21:47 +0000 | [diff] [blame] | 225 | printf(fname_fmt, arg); |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 226 | } |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 227 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 228 | /* If more than one file was processed, we want the totals. To save some |
| 229 | * space, we set the pcounts ptr to the totals array. This has the side |
| 230 | * effect of trashing the totals array after outputting it, but that's |
| 231 | * irrelavent since we no longer need it. */ |
| 232 | if (num_files > 1) { |
Denys Vlasenko | fb132e4 | 2010-10-29 11:46:52 +0200 | [diff] [blame] | 233 | num_files = 0; /* Make sure we don't get here again. */ |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 234 | arg = "total"; |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 235 | pcounts = totals; |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 236 | --argv; |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 237 | goto OUTPUT; |
Mark Whitley | 3950596 | 2000-07-20 00:08:10 +0000 | [diff] [blame] | 238 | } |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 239 | |
Denis Vlasenko | f0ed376 | 2006-10-26 23:21:47 +0000 | [diff] [blame] | 240 | fflush_stdout_and_exit(status); |
Erik Andersen | 3163821 | 2000-01-15 22:28:50 +0000 | [diff] [blame] | 241 | } |