Erik Andersen | e49d5ec | 2000-02-08 19:58:47 +0000 | [diff] [blame] | 1 | /* vi: set sw=4 ts=4: */ |
Erik Andersen | 3163821 | 2000-01-15 22:28:50 +0000 | [diff] [blame] | 2 | /* |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 3 | * wc implementation for busybox |
Erik Andersen | 3163821 | 2000-01-15 22:28:50 +0000 | [diff] [blame] | 4 | * |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 5 | * Copyright (C) 2003 Manuel Novoa III <mjn3@codepoet.org> |
Erik Andersen | 3163821 | 2000-01-15 22:28:50 +0000 | [diff] [blame] | 6 | * |
"Robert P. J. Day" | 801ab14 | 2006-07-12 07:56:04 +0000 | [diff] [blame] | 7 | * Licensed under GPLv2 or later, see file LICENSE in this tarball for details. |
Erik Andersen | 3163821 | 2000-01-15 22:28:50 +0000 | [diff] [blame] | 8 | */ |
| 9 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 10 | /* BB_AUDIT SUSv3 _NOT_ compliant -- option -m is not currently supported. */ |
| 11 | /* http://www.opengroup.org/onlinepubs/007904975/utilities/wc.html */ |
| 12 | |
| 13 | /* Mar 16, 2003 Manuel Novoa III (mjn3@codepoet.org) |
| 14 | * |
| 15 | * Rewritten to fix a number of problems and do some size optimizations. |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 16 | * Problems in the previous busybox implementation (besides bloat) included: |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 17 | * 1) broken 'wc -c' optimization (read note below) |
| 18 | * 2) broken handling of '-' args |
| 19 | * 3) no checking of ferror on EOF returns |
| 20 | * 4) isprint() wasn't considered when word counting. |
| 21 | * |
| 22 | * TODO: |
| 23 | * |
| 24 | * When locale support is enabled, count multibyte chars in the '-m' case. |
| 25 | * |
| 26 | * NOTES: |
| 27 | * |
| 28 | * The previous busybox wc attempted an optimization using stat for the |
| 29 | * case of counting chars only. I omitted that because it was broken. |
| 30 | * It didn't take into account the possibility of input coming from a |
| 31 | * pipe, or input from a file with file pointer not at the beginning. |
| 32 | * |
| 33 | * To implement such a speed optimization correctly, not only do you |
| 34 | * need the size, but also the file position. Note also that the |
| 35 | * file position may be past the end of file. Consider the example |
| 36 | * (adapted from example in gnu wc.c) |
| 37 | * |
| 38 | * echo hello > /tmp/testfile && |
Denis Vlasenko | b71c668 | 2007-07-21 15:08:09 +0000 | [diff] [blame] | 39 | * (dd ibs=1k skip=1 count=0 &> /dev/null; wc -c) < /tmp/testfile |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 40 | * |
| 41 | * for which 'wc -c' should output '0'. |
| 42 | */ |
| 43 | |
Denis Vlasenko | b6adbf1 | 2007-05-26 19:00:18 +0000 | [diff] [blame] | 44 | #include "libbb.h" |
Erik Andersen | 3163821 | 2000-01-15 22:28:50 +0000 | [diff] [blame] | 45 | |
/*
 * Character classification helpers for the counting loop.
 *
 * Without locale support, isspace()/isprint() are replaced by plain
 * range tests on the byte value: "printable" is 0x20..0x7e and
 * "space" is ' ' or one of the five control codes 9..13.  Inside that
 * printable range the only whitespace is ' ', so asking "is this
 * already-known-printable character a space?" is a single comparison.
 *
 * With locale support, the C library classifiers are used unchanged.
 */
#if !ENABLE_LOCALE_SUPPORT
# undef isspace
# undef isprint
/* 0x5f == 0x7e - 0x20 + 1; the unsigned cast also rejects negative c (EOF). */
# define isprint(c) (((unsigned int)((c) - 0x20)) < 0x5fU)
/* 5 == 13 - 9 + 1: \t \n \v \f \r */
# define isspace(c) (((c) == ' ') || (((unsigned int)((c) - 9)) < 5U))
# define isspace_given_isprint(c) ((c) == ' ')
#else
# define isspace_given_isprint(c) isspace(c)
#endif
| 55 | |
/*
 * Counter type used for every statistic, together with the matching
 * printf conversion (length modifier + specifier, without the '%').
 * ENABLE_FEATURE_WC_LARGE selects the wider unsigned long long variant.
 */
#if !ENABLE_FEATURE_WC_LARGE
# define COUNT_T unsigned
# define COUNT_FMT "u"
#else
# define COUNT_T unsigned long long
# define COUNT_FMT "llu"
#endif
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 63 | |
/*
 * Statistic slots.  Each value is used in wc_main() both as an index
 * into the counts[]/totals[] arrays and as the bit position of that
 * statistic in the print_type mask.
 */
enum {
	WC_LINES,	/* 0: number of '\n' characters read */
	WC_WORDS,	/* 1: number of words */
	WC_CHARS,	/* 2: number of characters returned by getc() */
	WC_LENGTH	/* 3: largest line position reached */
};
Erik Andersen | 3163821 | 2000-01-15 22:28:50 +0000 | [diff] [blame] | 70 | |
Denis Vlasenko | 9b49a5e | 2007-10-11 10:05:36 +0000 | [diff] [blame] | 71 | int wc_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; |
Denis Vlasenko | 68404f1 | 2008-03-17 09:00:54 +0000 | [diff] [blame] | 72 | int wc_main(int argc ATTRIBUTE_UNUSED, char **argv) |
Erik Andersen | e49d5ec | 2000-02-08 19:58:47 +0000 | [diff] [blame] | 73 | { |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 74 | FILE *fp; |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 75 | const char *s, *arg; |
Denis Vlasenko | e8419c9 | 2008-02-19 00:38:10 +0000 | [diff] [blame] | 76 | const char *start_fmt = " %9"COUNT_FMT + 1; |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 77 | const char *fname_fmt = " %s\n"; |
| 78 | COUNT_T *pcounts; |
| 79 | COUNT_T counts[4]; |
| 80 | COUNT_T totals[4]; |
| 81 | unsigned linepos; |
| 82 | unsigned u; |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 83 | int num_files = 0; |
| 84 | int c; |
Bernhard Reutner-Fischer | d58c194 | 2007-01-20 21:28:36 +0000 | [diff] [blame] | 85 | smallint status = EXIT_SUCCESS; |
| 86 | smallint in_word; |
Denis Vlasenko | 7021016 | 2006-09-29 23:41:59 +0000 | [diff] [blame] | 87 | unsigned print_type; |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 88 | |
Denis Vlasenko | fe7cd64 | 2007-08-18 15:32:12 +0000 | [diff] [blame] | 89 | print_type = getopt32(argv, "lwcL"); |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 90 | |
Glenn L McGrath | 02d090d | 2001-11-21 09:17:00 +0000 | [diff] [blame] | 91 | if (print_type == 0) { |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 92 | print_type = (1 << WC_LINES) | (1 << WC_WORDS) | (1 << WC_CHARS); |
Glenn L McGrath | 02d090d | 2001-11-21 09:17:00 +0000 | [diff] [blame] | 93 | } |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 94 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 95 | argv += optind; |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 96 | if (!argv[0]) { |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 97 | *--argv = (char *) bb_msg_standard_input; |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 98 | fname_fmt = "\n"; |
| 99 | if (!((print_type-1) & print_type)) /* exactly one option? */ |
| 100 | start_fmt = "%"COUNT_FMT; |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 101 | } |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 102 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 103 | memset(totals, 0, sizeof(totals)); |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 104 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 105 | pcounts = counts; |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 106 | |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 107 | while ((arg = *argv++) != 0) { |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 108 | ++num_files; |
Denis Vlasenko | ddec5af | 2006-10-26 23:25:17 +0000 | [diff] [blame] | 109 | fp = fopen_or_warn_stdin(arg); |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 110 | if (!fp) { |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 111 | status = EXIT_FAILURE; |
| 112 | continue; |
| 113 | } |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 114 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 115 | memset(counts, 0, sizeof(counts)); |
| 116 | linepos = 0; |
| 117 | in_word = 0; |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 118 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 119 | do { |
Denis Vlasenko | 7021016 | 2006-09-29 23:41:59 +0000 | [diff] [blame] | 120 | /* Our -w doesn't match GNU wc exactly... oh well */ |
| 121 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 122 | ++counts[WC_CHARS]; |
| 123 | c = getc(fp); |
| 124 | if (isprint(c)) { |
| 125 | ++linepos; |
| 126 | if (!isspace_given_isprint(c)) { |
| 127 | in_word = 1; |
| 128 | continue; |
| 129 | } |
| 130 | } else if (((unsigned int)(c - 9)) <= 4) { |
| 131 | /* \t 9 |
| 132 | * \n 10 |
| 133 | * \v 11 |
| 134 | * \f 12 |
| 135 | * \r 13 |
| 136 | */ |
| 137 | if (c == '\t') { |
| 138 | linepos = (linepos | 7) + 1; |
| 139 | } else { /* '\n', '\r', '\f', or '\v' */ |
| 140 | DO_EOF: |
| 141 | if (linepos > counts[WC_LENGTH]) { |
| 142 | counts[WC_LENGTH] = linepos; |
| 143 | } |
| 144 | if (c == '\n') { |
| 145 | ++counts[WC_LINES]; |
| 146 | } |
| 147 | if (c != '\v') { |
| 148 | linepos = 0; |
| 149 | } |
| 150 | } |
| 151 | } else if (c == EOF) { |
| 152 | if (ferror(fp)) { |
Denis Vlasenko | 0c97c9d | 2007-10-01 11:58:38 +0000 | [diff] [blame] | 153 | bb_simple_perror_msg(arg); |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 154 | status = EXIT_FAILURE; |
| 155 | } |
| 156 | --counts[WC_CHARS]; |
| 157 | goto DO_EOF; /* Treat an EOF as '\r'. */ |
Glenn L McGrath | 74afa9a | 2001-11-21 10:26:28 +0000 | [diff] [blame] | 158 | } else { |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 159 | continue; |
Glenn L McGrath | 74afa9a | 2001-11-21 10:26:28 +0000 | [diff] [blame] | 160 | } |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 161 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 162 | counts[WC_WORDS] += in_word; |
| 163 | in_word = 0; |
| 164 | if (c == EOF) { |
| 165 | break; |
| 166 | } |
| 167 | } while (1); |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 168 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 169 | if (totals[WC_LENGTH] < counts[WC_LENGTH]) { |
| 170 | totals[WC_LENGTH] = counts[WC_LENGTH]; |
Glenn L McGrath | 9e6c9f7 | 2001-11-21 12:46:36 +0000 | [diff] [blame] | 171 | } |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 172 | totals[WC_LENGTH] -= counts[WC_LENGTH]; |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 173 | |
Denis Vlasenko | ddec5af | 2006-10-26 23:25:17 +0000 | [diff] [blame] | 174 | fclose_if_not_stdin(fp); |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 175 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 176 | OUTPUT: |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 177 | /* coreutils wc tries hard to print pretty columns |
| 178 | * (saves results for all files, find max col len etc...) |
| 179 | * we won't try that hard, it will bloat us too much */ |
| 180 | s = start_fmt; |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 181 | u = 0; |
| 182 | do { |
| 183 | if (print_type & (1 << u)) { |
Denis Vlasenko | f0ed376 | 2006-10-26 23:21:47 +0000 | [diff] [blame] | 184 | printf(s, pcounts[u]); |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 185 | s = " %9"COUNT_FMT; /* Ok... restore the leading space. */ |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 186 | } |
| 187 | totals[u] += pcounts[u]; |
| 188 | } while (++u < 4); |
Denis Vlasenko | f0ed376 | 2006-10-26 23:21:47 +0000 | [diff] [blame] | 189 | printf(fname_fmt, arg); |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 190 | } |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 191 | |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 192 | /* If more than one file was processed, we want the totals. To save some |
| 193 | * space, we set the pcounts ptr to the totals array. This has the side |
| 194 | * effect of trashing the totals array after outputting it, but that's |
| 195 | * irrelavent since we no longer need it. */ |
| 196 | if (num_files > 1) { |
| 197 | num_files = 0; /* Make sure we don't get here again. */ |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 198 | arg = "total"; |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 199 | pcounts = totals; |
Denis Vlasenko | 3ed001f | 2006-09-29 23:41:04 +0000 | [diff] [blame] | 200 | --argv; |
Manuel Novoa III | cad5364 | 2003-03-19 09:13:01 +0000 | [diff] [blame] | 201 | goto OUTPUT; |
Mark Whitley | 3950596 | 2000-07-20 00:08:10 +0000 | [diff] [blame] | 202 | } |
Eric Andersen | c7bda1c | 2004-03-15 08:29:22 +0000 | [diff] [blame] | 203 | |
Denis Vlasenko | f0ed376 | 2006-10-26 23:21:47 +0000 | [diff] [blame] | 204 | fflush_stdout_and_exit(status); |
Erik Andersen | 3163821 | 2000-01-15 22:28:50 +0000 | [diff] [blame] | 205 | } |