1 |
#ifndef lint |
2 |
static const char RCSid[] = "$Id: rcollate.c,v 2.7 2013/11/18 22:02:12 greg Exp $"; |
3 |
#endif |
4 |
/* |
5 |
* Utility to re-order records in a binary or ASCII data file (matrix) |
6 |
*/ |
7 |
|
8 |
#include <stdlib.h> |
9 |
#include <string.h> |
10 |
#include <ctype.h> |
11 |
#include "platform.h" |
12 |
#include "rtio.h" |
13 |
#include "resolu.h" |
14 |
#ifdef _WIN32 |
15 |
#undef ftello |
16 |
#define ftello ftell |
17 |
#undef ssize_t |
18 |
#define ssize_t size_t |
19 |
#else |
20 |
#include <sys/mman.h> |
21 |
#endif |
22 |
|
23 |
#ifdef getc_unlocked /* avoid horrendous overhead of flockfile */ |
24 |
#undef getc |
25 |
#undef putc |
26 |
#define getc getc_unlocked |
27 |
#define putc putc_unlocked |
28 |
#endif |
29 |
|
30 |
/* Contents of a file held in memory, either read in or memory-mapped */
typedef struct {
	void	*base;		/* start of the loaded data */
	size_t	len;		/* number of bytes at base */
	int	mapped;		/* non-zero if base is a memory mapping */
} MEMLOAD;
35 |
|
36 |
/* Index of the records found in loaded ASCII data */
typedef struct {
	int	nw_rec;		/* words making up each record */
	int	nrecs;		/* count of records indexed */
	char	*rec[1];	/* start of each record (array extends struct) */
} RECINDEX;
41 |
|
42 |
int	warnings = 1;		/* non-zero to print warning messages */
43 |
|
44 |
/* free loaded file */ |
45 |
static void |
46 |
free_load(MEMLOAD *mp) |
47 |
{ |
48 |
if (mp == NULL || (mp->base == NULL) | (mp->len <= 0)) |
49 |
return; |
50 |
#ifdef MAP_FILE |
51 |
if (mp->mapped) |
52 |
munmap(mp->base, mp->len); |
53 |
else |
54 |
#endif |
55 |
free(mp->base); |
56 |
mp->base = NULL; |
57 |
mp->len = 0; |
58 |
} |
59 |
|
60 |
/* load a file into memory */ |
61 |
static int |
62 |
load_file(MEMLOAD *mp, FILE *fp) |
63 |
{ |
64 |
int fd; |
65 |
off_t skip, flen; |
66 |
|
67 |
if (mp == NULL) |
68 |
return(-1); |
69 |
mp->base = NULL; |
70 |
mp->len = 0; |
71 |
mp->mapped = 0; |
72 |
if (fp == NULL) |
73 |
return(-1); |
74 |
fd = fileno(fp); |
75 |
skip = ftello(fp); |
76 |
flen = lseek(fd, 0, SEEK_END); |
77 |
if (flen <= skip) |
78 |
return((int)(flen - skip)); |
79 |
mp->len = (size_t)(flen - skip); |
80 |
#ifdef MAP_FILE |
81 |
if (mp->len > 1L<<20) { /* map file if > 1 MByte */ |
82 |
mp->base = mmap(NULL, mp->len, PROT_READ, MAP_PRIVATE, fd, skip); |
83 |
if (mp->base != MAP_FAILED) { |
84 |
mp->mapped = 1; |
85 |
return(1); /* mmap() success */ |
86 |
} |
87 |
mp->base = NULL; /* fall back to reading it in... */ |
88 |
} |
89 |
#endif |
90 |
if (lseek(fd, skip, SEEK_SET) != skip || |
91 |
(mp->base = malloc(mp->len)) == NULL) { |
92 |
mp->len = 0; |
93 |
return(-1); |
94 |
} |
95 |
if (read(fd, (char *)mp->base, mp->len) != mp->len) { |
96 |
free_load(mp); |
97 |
return(-1); |
98 |
} |
99 |
return(1); |
100 |
} |
101 |
|
102 |
/* load memory from an input stream, starting from current position */ |
103 |
static int |
104 |
load_stream(MEMLOAD *mp, FILE *fp) |
105 |
{ |
106 |
size_t alloced = 0; |
107 |
char buf[8192]; |
108 |
size_t nr; |
109 |
|
110 |
if (mp == NULL) |
111 |
return(-1); |
112 |
mp->base = NULL; |
113 |
mp->len = 0; |
114 |
mp->mapped = 0; |
115 |
if (fp == NULL) |
116 |
return(-1); |
117 |
while ((nr = fread(buf, 1, sizeof(buf), fp)) > 0) { |
118 |
if (!alloced) |
119 |
mp->base = malloc(nr); |
120 |
else if (mp->len+nr > alloced) |
121 |
mp->base = realloc(mp->base, |
122 |
alloced = alloced*(2+(nr==sizeof(buf)))/2+nr); |
123 |
if (mp->base == NULL) |
124 |
return(-1); |
125 |
memcpy((char *)mp->base + mp->len, buf, nr); |
126 |
mp->len += nr; |
127 |
} |
128 |
if (ferror(fp)) { |
129 |
free_load(mp); |
130 |
return(-1); |
131 |
} |
132 |
if (alloced > mp->len*5/4) /* don't waste too much space */ |
133 |
mp->base = realloc(mp->base, mp->len); |
134 |
return(mp->len > 0); |
135 |
} |
136 |
|
137 |
/* release a record index returned by index_records() */
#define free_records(rp)	free(rp)
139 |
|
140 |
/* compute record index */ |
141 |
static RECINDEX * |
142 |
index_records(const MEMLOAD *mp, int nw_rec) |
143 |
{ |
144 |
RECINDEX *rp; |
145 |
char *cp, *mend; |
146 |
int n; |
147 |
|
148 |
if (mp == NULL || (mp->base == NULL) | (mp->len <= 0)) |
149 |
return(NULL); |
150 |
if (nw_rec <= 0) |
151 |
return(NULL); |
152 |
rp = (RECINDEX *)malloc(sizeof(RECINDEX) + mp->len/(2*nw_rec)*sizeof(char *)); |
153 |
if (rp == NULL) |
154 |
return(NULL); |
155 |
rp->nw_rec = nw_rec; |
156 |
rp->nrecs = 0; |
157 |
cp = (char *)mp->base; |
158 |
mend = cp + mp->len; |
159 |
for ( ; ; ) { /* whitespace-separated words */ |
160 |
while (cp < mend && !*cp | isspace(*cp)) |
161 |
++cp; |
162 |
if (cp >= mend) |
163 |
break; |
164 |
rp->rec[rp->nrecs++] = cp; /* point to first non-white */ |
165 |
n = rp->nw_rec; |
166 |
while (++cp < mend) /* find end of record */ |
167 |
if (!*cp | isspace(*cp)) { |
168 |
if (--n <= 0) |
169 |
break; /* got requisite # words */ |
170 |
do { /* else find next word */ |
171 |
if (*cp == '\n') { |
172 |
fprintf(stderr, |
173 |
"Unexpected EOL in record!\n"); |
174 |
free_records(rp); |
175 |
return(NULL); |
176 |
} |
177 |
if (++cp >= mend) |
178 |
break; |
179 |
} while (!*cp | isspace(*cp)); |
180 |
} |
181 |
} |
182 |
rp->rec[rp->nrecs] = mend; /* reallocate to save space */ |
183 |
rp = (RECINDEX *)realloc(rp, |
184 |
sizeof(RECINDEX) + rp->nrecs*sizeof(char *)); |
185 |
return(rp); |
186 |
} |
187 |
|
188 |
/* count number of columns based on first EOL */ |
189 |
static int |
190 |
count_columns(const RECINDEX *rp) |
191 |
{ |
192 |
char *cp = rp->rec[0]; |
193 |
char *mend = rp->rec[rp->nrecs]; |
194 |
int i; |
195 |
|
196 |
while (*cp != '\n') |
197 |
if (++cp >= mend) |
198 |
return(0); |
199 |
for (i = 0; i < rp->nrecs; i++) |
200 |
if (rp->rec[i] >= cp) |
201 |
break; |
202 |
return(i); |
203 |
} |
204 |
|
205 |
/* copy nth record from index to stdout */ |
206 |
static int |
207 |
print_record(const RECINDEX *rp, int n) |
208 |
{ |
209 |
int words2go = rp->nw_rec; |
210 |
char *scp; |
211 |
|
212 |
if ((n < 0) | (n >= rp->nrecs)) |
213 |
return(0); |
214 |
scp = rp->rec[n]; |
215 |
do { |
216 |
putc(*scp++, stdout); |
217 |
if (!*scp | isspace(*scp)) { |
218 |
if (--words2go <= 0) |
219 |
break; |
220 |
putc(' ', stdout); /* single space btwn. words */ |
221 |
do |
222 |
if (++scp >= rp->rec[n+1]) |
223 |
break; |
224 |
while (!*scp | isspace(*scp)); |
225 |
} |
226 |
} while (scp < rp->rec[n+1]); |
227 |
/* caller adds record sep. */ |
228 |
return(1); |
229 |
} |
230 |
|
231 |
/*
 * Copy everything remaining on a stream's file descriptor to stdout.
 *
 * The copy bypasses stdio, so any data already sitting in the input
 * stream's buffer is not transferred.  Returns 1 on success and 0 if
 * fp is NULL or a read or write fails.
 */
static int
output_stream(FILE *fp)
{
	char	chunk[8192];
	ssize_t	nread;

	if (fp == NULL)
		return(0);
	fflush(stdout);			/* assumes nothing in input buffer */
	for ( ; ; ) {
		nread = read(fileno(fp), chunk, sizeof(chunk));
		if (nread <= 0)		/* EOF or error */
			break;
		if (write(fileno(stdout), chunk, nread) != nread)
			return(0);
	}
	return(nread >= 0);
}
246 |
|
247 |
/*
 * Get the next word from a stream, leaving the stream positioned on an
 * EOL or at the start of the following word.
 *
 *	buf	receives the NUL-terminated word (at most 255 characters;
 *		a longer word is split across successive calls)
 *	fp	input stream
 *
 * Leading NUL bytes and white space (including newlines) are skipped.
 * Returns buf, or NULL if end of input is reached before any word.
 */
static char *
fget_word(char buf[256], FILE *fp)
{
	int	c;
	char	*cp;
					/* skip nul's and white space */
	while (!(c = getc(fp)) || isspace(c))
		;
	if (c == EOF)
		return(NULL);
	cp = buf;
	do
		*cp++ = c;
	while ((c = getc(fp)) != EOF && !isspace(c) && cp < buf+255);
	*cp = '\0';
	/*
	 * Skip trailing blanks but stop on a newline.  isspace() only
	 * promises a non-zero result, not 1, so it must be combined with
	 * a logical (not bitwise) AND.
	 */
	while (c != '\n' && isspace(c))
		c = getc(fp);
	if (c != EOF)
		ungetc(c, fp);
	return(buf);
}
269 |
|
270 |
char	*fmtid = "ascii";	/* name of the data format in use */
int	record_width = 3;	/* words per record (negative for binary) */
int	ni_columns = 0;		/* input column count (0 if not given) */
int	ni_rows = 0;		/* input row count (0 if not given) */
int	no_columns = 0;		/* output column count (0 if not given) */
int	no_rows = 0;		/* output row count (0 if not given) */
276 |
|
277 |
/* output transposed ASCII or binary data from memory */ |
278 |
static int |
279 |
do_transpose(const MEMLOAD *mp) |
280 |
{ |
281 |
static const char tabEOL[2] = {'\t','\n'}; |
282 |
RECINDEX *rp = NULL; |
283 |
long nrecords; |
284 |
int i, j; |
285 |
/* propogate sizes */ |
286 |
if (ni_rows <= 0) |
287 |
ni_rows = no_columns; |
288 |
if (ni_columns <= 0) |
289 |
ni_columns = no_rows; |
290 |
/* get # records (& index) */ |
291 |
if (record_width > 0) { |
292 |
if ((rp = index_records(mp, record_width)) == NULL) |
293 |
return(0); |
294 |
if (ni_columns <= 0) |
295 |
ni_columns = count_columns(rp); |
296 |
nrecords = rp->nrecs; |
297 |
} else if ((ni_rows > 0) & (ni_columns > 0)) { |
298 |
nrecords = ni_rows*ni_columns; |
299 |
if (nrecords > mp->len / -record_width) { |
300 |
fprintf(stderr, |
301 |
"Input too small for specified size and type\n"); |
302 |
return(0); |
303 |
} |
304 |
} else |
305 |
nrecords = mp->len / -record_width; |
306 |
/* check sizes */ |
307 |
if ((ni_rows <= 0) & (ni_columns > 0)) |
308 |
ni_rows = nrecords/ni_columns; |
309 |
if ((ni_columns <= 0) & (ni_rows > 0)) |
310 |
ni_columns = nrecords/ni_rows; |
311 |
if (nrecords != ni_rows*ni_columns) |
312 |
goto badspec; |
313 |
if (no_columns <= 0) |
314 |
no_columns = ni_rows; |
315 |
if (no_rows <= 0) |
316 |
no_rows = ni_columns; |
317 |
if ((no_rows != ni_columns) | (no_columns != ni_rows)) |
318 |
goto badspec; |
319 |
/* transpose records */ |
320 |
for (i = 0; i < no_rows; i++) { |
321 |
for (j = 0; j < no_columns; j++) |
322 |
if (rp != NULL) { /* ASCII output */ |
323 |
print_record(rp, j*ni_columns + i); |
324 |
putc(tabEOL[j >= no_columns-1], stdout); |
325 |
} else { /* binary output */ |
326 |
fwrite((char *)mp->base + |
327 |
-record_width*(j*ni_columns + i), |
328 |
-record_width, 1, stdout); |
329 |
} |
330 |
if (ferror(stdout)) { |
331 |
fprintf(stderr, "Error writing to stdout\n"); |
332 |
return(0); |
333 |
} |
334 |
} |
335 |
if (rp != NULL) |
336 |
free_records(rp); |
337 |
return(1); |
338 |
badspec: |
339 |
fprintf(stderr, "Bad transpose specification -- check dimension(s)\n"); |
340 |
return(0); |
341 |
} |
342 |
|
343 |
/* resize ASCII stream input by ignoring EOLs between records */ |
344 |
static int |
345 |
do_resize(FILE *fp) |
346 |
{ |
347 |
long records2go = ni_rows*ni_columns; |
348 |
int columns2go = no_columns; |
349 |
char word[256]; |
350 |
/* sanity checks */ |
351 |
if (record_width <= 0) { |
352 |
fprintf(stderr, "Bad call to do_resize (record_width = %d)\n", |
353 |
record_width); |
354 |
return(0); |
355 |
} |
356 |
if (no_columns <= 0) { |
357 |
fprintf(stderr, "Missing -oc specification\n"); |
358 |
return(0); |
359 |
} |
360 |
if ((records2go <= 0) & (no_rows > 0)) |
361 |
records2go = no_rows*no_columns; |
362 |
else if (no_rows*no_columns != records2go) { |
363 |
fprintf(stderr, |
364 |
"Input and output data sizes disagree (%dx%d != %dx%d)\n", |
365 |
ni_rows, ni_columns, no_rows, no_columns); |
366 |
return(0); |
367 |
} |
368 |
do { /* reshape records */ |
369 |
int n; |
370 |
|
371 |
for (n = record_width; n--; ) { |
372 |
if (fget_word(word, fp) == NULL) { |
373 |
if (records2go > 0 || n < record_width-1) |
374 |
break; |
375 |
goto done; /* normal EOD */ |
376 |
} |
377 |
fputs(word, stdout); |
378 |
if (n) { /* mid-record? */ |
379 |
int c = getc(fp); |
380 |
if ((c == '\n') | (c == EOF)) |
381 |
break; |
382 |
ungetc(c, fp); |
383 |
putc(' ', stdout); |
384 |
} |
385 |
} |
386 |
if (n >= 0) { |
387 |
fprintf(stderr, "Incomplete record / unexpected EOF\n"); |
388 |
return(0); |
389 |
} |
390 |
if (--columns2go <= 0) { /* time to end output row? */ |
391 |
putc('\n', stdout); |
392 |
columns2go = no_columns; |
393 |
} else /* else separate records */ |
394 |
putc('\t', stdout); |
395 |
} while (--records2go); /* expected EOD? */ |
396 |
done: |
397 |
if (warnings && columns2go != no_columns) |
398 |
fprintf(stderr, "Warning -- incomplete final row\n"); |
399 |
if (warnings && fget_word(word, fp) != NULL) |
400 |
fprintf(stderr, "Warning -- characters beyond expected EOD\n"); |
401 |
return(1); |
402 |
} |
403 |
|
404 |
/* process a header line and copy to stdout */ |
405 |
static int |
406 |
headline(char *s, void *p) |
407 |
{ |
408 |
char fmt[32]; |
409 |
|
410 |
if (formatval(fmt, s)) { |
411 |
if (!strcmp(fmt, fmtid)) |
412 |
return(0); |
413 |
fprintf(stderr, "Input format '%s' != '%s'\n", fmt, fmtid); |
414 |
return(-1); |
415 |
} |
416 |
fputs(s, stdout); /* copy header info. */ |
417 |
return(0); |
418 |
} |
419 |
|
420 |
/* main routine for converting rows/columns in data file */ |
421 |
int |
422 |
main(int argc, char *argv[]) |
423 |
{ |
424 |
int do_header = 1; /* header i/o? */ |
425 |
int transpose = 0; /* transpose rows & cols? */ |
426 |
int i; |
427 |
|
428 |
for (i = 1; i < argc && argv[i][0] == '-'; i++) |
429 |
switch (argv[i][1]) { |
430 |
case 'i': /* input */ |
431 |
if (argv[i][2] == 'c') /* columns */ |
432 |
ni_columns = atoi(argv[++i]); |
433 |
else if (argv[i][2] == 'r') |
434 |
ni_rows = atoi(argv[++i]); |
435 |
else |
436 |
goto userr; |
437 |
break; |
438 |
case 'o': /* output */ |
439 |
if (argv[i][2] == 'c') /* columns */ |
440 |
no_columns = atoi(argv[++i]); |
441 |
else if (argv[i][2] == 'r') |
442 |
no_rows = atoi(argv[++i]); |
443 |
else |
444 |
goto userr; |
445 |
break; |
446 |
case 'h': /* header on/off */ |
447 |
do_header = !do_header; |
448 |
break; |
449 |
case 't': /* transpose on/off */ |
450 |
transpose = !transpose; |
451 |
break; |
452 |
case 'f': /* format */ |
453 |
switch (argv[i][2]) { |
454 |
case 'a': /* ASCII */ |
455 |
case 'A': |
456 |
fmtid = "ascii"; |
457 |
record_width = 1; |
458 |
break; |
459 |
case 'f': /* float */ |
460 |
case 'F': |
461 |
fmtid = "float"; |
462 |
record_width = -(int)sizeof(float); |
463 |
break; |
464 |
case 'd': /* double */ |
465 |
case 'D': |
466 |
fmtid = "double"; |
467 |
record_width = -(int)sizeof(double); |
468 |
break; |
469 |
case 'b': /* binary (bytes) */ |
470 |
case 'B': |
471 |
fmtid = "byte"; |
472 |
record_width = -1; |
473 |
break; |
474 |
default: |
475 |
goto userr; |
476 |
} |
477 |
if (argv[i][3]) { |
478 |
if (!isdigit(argv[i][3])) |
479 |
goto userr; |
480 |
record_width *= atoi(argv[i]+3); |
481 |
} |
482 |
break; |
483 |
case 'w': /* warnings on/off */ |
484 |
warnings = !warnings; |
485 |
break; |
486 |
default: |
487 |
goto userr; |
488 |
} |
489 |
if (!record_width) |
490 |
goto userr; |
491 |
if (i < argc-1) /* arg count OK? */ |
492 |
goto userr; |
493 |
/* open input file? */ |
494 |
if (i == argc-1 && freopen(argv[i], "r", stdin) == NULL) { |
495 |
fprintf(stderr, "%s: cannot open for reading\n", argv[i]); |
496 |
return(1); |
497 |
} |
498 |
if (record_width < 0) { |
499 |
SET_FILE_BINARY(stdin); |
500 |
SET_FILE_BINARY(stdout); |
501 |
} |
502 |
/* check for no-op */ |
503 |
if (!transpose && (record_width < 0 || |
504 |
(no_columns == ni_columns) & (no_rows == ni_rows))) { |
505 |
if (warnings) |
506 |
fprintf(stderr, "%s: no-op -- copying input verbatim\n", |
507 |
argv[0]); |
508 |
if (!output_stream(stdin)) |
509 |
return(1); |
510 |
return(0); |
511 |
} |
512 |
if (do_header) { /* read/write header */ |
513 |
if (getheader(stdin, &headline, NULL) < 0) |
514 |
return(1); |
515 |
printargs(argc, argv, stdout); |
516 |
fputformat(fmtid, stdout); |
517 |
fputc('\n', stdout); /* finish new header */ |
518 |
} |
519 |
if (transpose) { /* transposing rows & columns? */ |
520 |
MEMLOAD myMem; /* need to load into memory */ |
521 |
if (i == argc-1) { |
522 |
if (load_file(&myMem, stdin) <= 0) { |
523 |
fprintf(stderr, "%s: error loading file into memory\n", |
524 |
argv[i]); |
525 |
return(1); |
526 |
} |
527 |
} else if (load_stream(&myMem, stdin) <= 0) { |
528 |
fprintf(stderr, "%s: error loading stdin into memory\n", |
529 |
argv[0]); |
530 |
return(1); |
531 |
} |
532 |
if (!do_transpose(&myMem)) |
533 |
return(1); |
534 |
/* free_load(&myMem); */ |
535 |
} else if (!do_resize(stdin)) /* just reshaping input */ |
536 |
return(1); |
537 |
return(0); |
538 |
userr: |
539 |
fprintf(stderr, |
540 |
"Usage: %s [-h][-w][-f[afdb][N]][-t][-ic in_col][-ir in_row][-oc out_col][-or out_row] [input.dat]\n", |
541 |
argv[0]); |
542 |
return(1); |
543 |
} |