1 |
#ifndef lint |
2 |
static const char RCSid[] = "$Id: rcollate.c,v 2.1 2013/09/05 17:53:23 greg Exp $"; |
3 |
#endif |
4 |
/* |
5 |
* Utility to re-order records in a binary or ASCII data file (matrix) |
6 |
*/ |
7 |
|
8 |
#include <stdlib.h> |
9 |
#include <unistd.h> |
10 |
#include <string.h> |
11 |
#include <ctype.h> |
12 |
#include "platform.h" |
13 |
#include "rtio.h" |
14 |
#include "resolu.h" |
15 |
#ifndef _WIN32 |
16 |
#include <sys/mman.h> |
17 |
#endif |
18 |
|
19 |
#ifdef getc_unlocked /* avoid horrendous overhead of flockfile */ |
20 |
#undef getc |
21 |
#undef putc |
22 |
#define getc getc_unlocked |
23 |
#define putc putc_unlocked |
24 |
#endif |
25 |
|
26 |
/*
 * Contents of a file held in memory.  The data either lives in a heap
 * buffer or is a memory-mapped view of the file; free_load() uses the
 * mapped flag to decide how to release it.
 */
typedef struct {
	void	*base;		/* start of the loaded data (NULL when empty) */
	size_t	len;		/* number of bytes available at base */
	int	mapped;		/* non-zero if base came from mmap(), else malloc() */
} MEMLOAD;
31 |
|
32 |
/*
 * Index of the records found in a loaded ASCII buffer.
 * The structure is over-allocated so that rec[] holds nrecs+1 pointers:
 * rec[0..nrecs-1] point at the first character of each record and
 * rec[nrecs] marks the end of the data.
 */
typedef struct {
	int	nw_rec;		/* words making up one record */
	int	nrecs;		/* count of records located */
	char	*rec[1];	/* record start pointers (array extends past struct) */
} RECINDEX;
37 |
|
38 |
/* free loaded file */ |
39 |
static void |
40 |
free_load(MEMLOAD *mp) |
41 |
{ |
42 |
if (mp == NULL || (mp->base == NULL) | (mp->len <= 0)) |
43 |
return; |
44 |
#ifdef MAP_FILE |
45 |
if (mp->mapped) |
46 |
munmap(mp->base, mp->len); |
47 |
else |
48 |
#endif |
49 |
free(mp->base); |
50 |
mp->base = NULL; |
51 |
mp->len = 0; |
52 |
} |
53 |
|
54 |
/* load a file into memory */ |
55 |
static int |
56 |
load_file(MEMLOAD *mp, FILE *fp) |
57 |
{ |
58 |
int fd; |
59 |
off_t skip, flen; |
60 |
|
61 |
if (mp == NULL) |
62 |
return(-1); |
63 |
mp->base = NULL; |
64 |
mp->len = 0; |
65 |
mp->mapped = 0; |
66 |
if (fp == NULL) |
67 |
return(-1); |
68 |
fd = fileno(fp); |
69 |
skip = ftello(fp); |
70 |
flen = lseek(fd, 0, SEEK_END); |
71 |
if (flen <= skip) |
72 |
return((int)(flen - skip)); |
73 |
mp->len = (size_t)(flen - skip); |
74 |
#ifdef MAP_FILE |
75 |
if (mp->len > 1L<<20) { /* map file if > 1 MByte */ |
76 |
mp->base = mmap(NULL, mp->len, PROT_READ, MAP_PRIVATE, fd, skip); |
77 |
if (mp->base != MAP_FAILED) { |
78 |
mp->mapped = 1; |
79 |
return(1); /* mmap() success */ |
80 |
} |
81 |
mp->base = NULL; /* fall back to reading it in... */ |
82 |
} |
83 |
#endif |
84 |
if (lseek(fd, skip, SEEK_SET) != skip || |
85 |
(mp->base = malloc(mp->len)) == NULL) { |
86 |
mp->len = 0; |
87 |
return(-1); |
88 |
} |
89 |
if (read(fd, (char *)mp->base, mp->len) != mp->len) { |
90 |
free_load(mp); |
91 |
return(-1); |
92 |
} |
93 |
return(1); |
94 |
} |
95 |
|
96 |
/* load memory from an input stream, starting from current position */ |
97 |
static int |
98 |
load_stream(MEMLOAD *mp, FILE *fp) |
99 |
{ |
100 |
char buf[8192]; |
101 |
size_t nr; |
102 |
|
103 |
if (mp == NULL) |
104 |
return(-1); |
105 |
mp->base = NULL; |
106 |
mp->len = 0; |
107 |
mp->mapped = 0; |
108 |
if (fp == NULL) |
109 |
return(-1); |
110 |
while ((nr = fread(buf, 1, sizeof(buf), fp)) > 0) { |
111 |
if (!mp->len) |
112 |
mp->base = malloc(nr); |
113 |
else |
114 |
mp->base = realloc(mp->base, mp->len+nr); |
115 |
if (mp->base == NULL) |
116 |
return(-1); |
117 |
memcpy((char *)mp->base + mp->len, buf, nr); |
118 |
mp->len += nr; |
119 |
} |
120 |
if (ferror(fp)) { |
121 |
free_load(mp); |
122 |
return(-1); |
123 |
} |
124 |
return(mp->len > 0); |
125 |
} |
126 |
|
127 |
/* release a record index (the index is a single heap block) */
#define free_records(rp)	free(rp)
129 |
|
130 |
/* compute record index */ |
131 |
static RECINDEX * |
132 |
index_records(const MEMLOAD *mp, int nw_rec) |
133 |
{ |
134 |
RECINDEX *rp; |
135 |
char *cp, *mend; |
136 |
int n; |
137 |
|
138 |
if (mp == NULL || (mp->base == NULL) | (mp->len <= 0)) |
139 |
return(NULL); |
140 |
if (nw_rec <= 0) |
141 |
return(NULL); |
142 |
rp = (RECINDEX *)malloc(sizeof(RECINDEX) + mp->len/(2*nw_rec)*sizeof(char *)); |
143 |
if (rp == NULL) |
144 |
return(NULL); |
145 |
rp->nw_rec = nw_rec; |
146 |
rp->nrecs = 0; |
147 |
cp = (char *)mp->base; |
148 |
mend = cp + mp->len; |
149 |
for ( ; ; ) { /* whitespace-separated words */ |
150 |
while (cp < mend && !*cp | isspace(*cp)) |
151 |
++cp; |
152 |
if (cp >= mend) |
153 |
break; |
154 |
rp->rec[rp->nrecs++] = cp; /* point to first non-white */ |
155 |
n = rp->nw_rec; |
156 |
while (++cp < mend) /* find end of record */ |
157 |
if (!*cp | isspace(*cp)) { |
158 |
if (--n <= 0) |
159 |
break; /* got requisite # words */ |
160 |
do { /* else find next word */ |
161 |
if (*cp == '\n') { |
162 |
fprintf(stderr, |
163 |
"Unexpected EOL in record!\n"); |
164 |
free_records(rp); |
165 |
return(NULL); |
166 |
} |
167 |
if (++cp >= mend) |
168 |
break; |
169 |
} while (!*cp | isspace(*cp)); |
170 |
} |
171 |
} |
172 |
rp->rec[rp->nrecs] = mend; /* reallocate to save space */ |
173 |
rp = (RECINDEX *)realloc(rp, |
174 |
sizeof(RECINDEX) + rp->nrecs*sizeof(char *)); |
175 |
return(rp); |
176 |
} |
177 |
|
178 |
/* count number of columns based on first EOL */ |
179 |
static int |
180 |
count_columns(const RECINDEX *rp) |
181 |
{ |
182 |
char *cp = rp->rec[0]; |
183 |
char *mend = rp->rec[rp->nrecs]; |
184 |
int i; |
185 |
|
186 |
while (*cp != '\n') |
187 |
if (++cp >= mend) |
188 |
return(0); |
189 |
for (i = 0; i < rp->nrecs; i++) |
190 |
if (rp->rec[i] >= cp) |
191 |
break; |
192 |
return(i); |
193 |
} |
194 |
|
195 |
/* copy nth record from index to stdout */ |
196 |
static int |
197 |
print_record(const RECINDEX *rp, int n) |
198 |
{ |
199 |
int words2go = rp->nw_rec; |
200 |
char *scp; |
201 |
|
202 |
if ((n < 0) | (n >= rp->nrecs)) |
203 |
return(0); |
204 |
scp = rp->rec[n]; |
205 |
do { |
206 |
putc(*scp++, stdout); |
207 |
if (!*scp | isspace(*scp)) { |
208 |
if (--words2go <= 0) |
209 |
break; |
210 |
putc(' ', stdout); /* single space btwn. words */ |
211 |
do |
212 |
if (++scp >= rp->rec[n+1]) |
213 |
break; |
214 |
while (!*scp | isspace(*scp)); |
215 |
} |
216 |
} while (scp < rp->rec[n+1]); |
217 |
/* caller adds record sep. */ |
218 |
return(1); |
219 |
} |
220 |
|
221 |
/*
 * Copy the rest of a stream to stdout verbatim.
 *
 * The copy works on the underlying file descriptors, so stdout is flushed
 * first and any data already sitting in fp's stdio buffer is NOT copied;
 * call this before reading from fp through stdio.
 *
 *	fp	input stream to copy from
 *
 * Returns 1 when the input was copied through to EOF, 0 if fp is NULL or
 * a read or write error occurred.
 */
static int
output_stream(FILE *fp)
{
	char	buf[8192];
	ssize_t	n;
	int	ifd, ofd;

	if (fp == NULL)
		return(0);
	fflush(stdout);			/* assumes nothing in input buffer */
	ifd = fileno(fp);
	ofd = fileno(stdout);
	while ((n = read(ifd, buf, sizeof(buf))) > 0) {
		char	*bp = buf;
		while (n > 0) {		/* write() may accept only part */
			ssize_t	nw = write(ofd, bp, (size_t)n);
			if (nw <= 0)
				return(0);
			bp += nw;
			n -= nw;
		}
	}
	return(n >= 0);			/* n < 0 means the last read failed */
}
236 |
|
237 |
/*
 * Read the next word from a stream.
 *
 * Leading NUL bytes and white space (including newlines) are skipped,
 * then characters are collected up to the next white space or EOF.  Words
 * longer than 255 characters are split.  White space following the word is
 * consumed up to, but not including, a newline, so the stream is left at
 * the EOL or at the start of the next word.
 *
 *	buf	receives the NUL-terminated word
 *	fp	input stream
 *
 * Returns buf, or NULL if EOF was reached before any word was found.
 */
static char *
fget_word(char buf[256], FILE *fp)
{
	int	c;
	char	*cp;
					/* skip nul's and white space */
	while (!(c = getc(fp)) || isspace(c))
		;
	if (c == EOF)
		return(NULL);
	cp = buf;
	do
		*cp++ = c;
	while ((c = getc(fp)) != EOF && !isspace(c) && cp < buf+255);
	*cp = '\0';
	/*
	 * isspace() only promises a non-zero result, so it must be combined
	 * with a logical AND -- a bitwise AND against (c != '\n') can be 0
	 * even when both conditions hold.
	 */
	while (c != '\n' && isspace(c))
		c = getc(fp);
	if (c != EOF)
		ungetc(c, fp);		/* leave EOL or next word's first char */
	return(buf);
}
259 |
|
260 |
/*
 * Conversion settings, filled in from the command line by main().
 * Dimensions of 0 (or less) mean "not specified".
 */
char	*fmtid = "ascii";	/* data format name, checked against/written to header */
int	record_width = 3;	/* >0: words per ASCII record, <0: -(bytes per binary record) */
int	ni_columns = 0;		/* input column count */
int	ni_rows = 0;		/* input row count */
int	no_columns = 0;		/* output column count */
int	no_rows = 0;		/* output row count */
266 |
|
267 |
/* output transposed ASCII or binary data from memory */ |
268 |
static int |
269 |
do_transpose(const MEMLOAD *mp) |
270 |
{ |
271 |
static const char tabEOL[2] = {'\t','\n'}; |
272 |
RECINDEX *rp = NULL; |
273 |
long nrecords; |
274 |
int i, j; |
275 |
/* get # records (& index) */ |
276 |
if (record_width > 0) { |
277 |
if ((rp = index_records(mp, record_width)) == NULL) |
278 |
return(0); |
279 |
if (ni_columns <= 0) |
280 |
ni_columns = count_columns(rp); |
281 |
nrecords = rp->nrecs; |
282 |
} else if ((ni_rows > 0) & (ni_columns > 0)) |
283 |
nrecords = ni_rows*ni_columns; |
284 |
else |
285 |
nrecords = mp->len / -record_width; |
286 |
/* check sizes */ |
287 |
if (ni_rows <= 0) |
288 |
ni_rows = no_columns; |
289 |
if (ni_columns <= 0) |
290 |
ni_columns = no_rows; |
291 |
if ((ni_rows <= 0) & (ni_columns > 0)) |
292 |
ni_rows = nrecords/ni_columns; |
293 |
if ((ni_columns <= 0) & (ni_rows > 0)) |
294 |
ni_columns = nrecords/ni_rows; |
295 |
if (nrecords != ni_rows*ni_columns) |
296 |
goto badspec; |
297 |
if (no_columns <= 0) |
298 |
no_columns = ni_rows; |
299 |
if (no_rows <= 0) |
300 |
no_rows = ni_columns; |
301 |
if ((no_rows != ni_columns) | (no_columns != ni_rows)) |
302 |
goto badspec; |
303 |
/* transpose records */ |
304 |
for (i = 0; i < no_rows; i++) { |
305 |
for (j = 0; j < no_columns; j++) |
306 |
if (rp != NULL) { /* ASCII output */ |
307 |
print_record(rp, j*ni_columns + i); |
308 |
putc(tabEOL[j >= no_columns-1], stdout); |
309 |
} else { /* binary output */ |
310 |
fwrite((char *)mp->base + |
311 |
-record_width*(j*ni_columns + i), |
312 |
-record_width, 1, stdout); |
313 |
} |
314 |
if (ferror(stdout)) { |
315 |
fprintf(stderr, "Error writing to stdout\n"); |
316 |
return(0); |
317 |
} |
318 |
} |
319 |
if (rp != NULL) |
320 |
free_records(rp); |
321 |
return(1); |
322 |
badspec: |
323 |
fprintf(stderr, "Bad transpose specification -- check dimension(s)\n"); |
324 |
return(0); |
325 |
} |
326 |
|
327 |
/* resize ASCII stream input by ignoring EOLs between records */ |
328 |
static int |
329 |
do_resize(FILE *fp) |
330 |
{ |
331 |
long records2go = ni_rows*ni_columns; |
332 |
int columns2go = no_columns; |
333 |
char word[256]; |
334 |
/* sanity checks */ |
335 |
if (record_width <= 0) { |
336 |
fprintf(stderr, "Bad call to do_resize (record_width = %d)\n", |
337 |
record_width); |
338 |
return(0); |
339 |
} |
340 |
if (no_columns <= 0) { |
341 |
fprintf(stderr, "Missing -oc specification\n"); |
342 |
return(0); |
343 |
} |
344 |
if ((records2go <= 0) & (no_rows > 0)) |
345 |
records2go = no_rows*no_columns; |
346 |
else if (no_rows*no_columns != records2go) { |
347 |
fprintf(stderr, |
348 |
"Input and output data sizes disagree (%dx%d != %dx%d)\n", |
349 |
ni_rows, ni_columns, no_rows, no_columns); |
350 |
return(0); |
351 |
} |
352 |
do { /* reshape records */ |
353 |
int n; |
354 |
|
355 |
for (n = record_width; n--; ) { |
356 |
if (fget_word(word, fp) == NULL) { |
357 |
if (records2go > 0 || n < record_width-1) |
358 |
break; |
359 |
goto done; /* normal EOD */ |
360 |
} |
361 |
fputs(word, stdout); |
362 |
if (n) { /* mid-record? */ |
363 |
int c = getc(fp); |
364 |
if ((c == '\n') | (c == EOF)) |
365 |
break; |
366 |
ungetc(c, fp); |
367 |
putc(' ', stdout); |
368 |
} |
369 |
} |
370 |
if (n >= 0) { |
371 |
fprintf(stderr, "Incomplete record / unexpected EOF\n"); |
372 |
return(0); |
373 |
} |
374 |
if (--columns2go <= 0) { /* time to end output row? */ |
375 |
putc('\n', stdout); |
376 |
columns2go = no_columns; |
377 |
} else /* else separate records */ |
378 |
putc('\t', stdout); |
379 |
} while (--records2go); /* expected EOD? */ |
380 |
done: |
381 |
if (columns2go != no_columns) |
382 |
fprintf(stderr, "Warning -- incomplete final row\n"); |
383 |
if (fget_word(word, fp) != NULL) |
384 |
fprintf(stderr, "Warning -- data beyond expected EOF\n"); |
385 |
return(1); |
386 |
} |
387 |
|
388 |
/* process a header line and copy to stdout */ |
389 |
static int |
390 |
headline(char *s, void *p) |
391 |
{ |
392 |
char fmt[32]; |
393 |
|
394 |
if (formatval(fmt, s)) { |
395 |
if (!strcmp(fmt, fmtid)) |
396 |
return(0); |
397 |
fprintf(stderr, "Input format '%s' != '%s'\n", fmt, fmtid); |
398 |
return(-1); |
399 |
} |
400 |
fputs(s, stdout); /* copy header info. */ |
401 |
return(0); |
402 |
} |
403 |
|
404 |
/* main routine for converting rows/columns in data file */ |
405 |
int |
406 |
main(int argc, char *argv[]) |
407 |
{ |
408 |
int do_header = 1; /* header i/o? */ |
409 |
int transpose = 0; /* transpose rows & cols? */ |
410 |
int i; |
411 |
|
412 |
for (i = 1; i < argc && argv[i][0] == '-'; i++) |
413 |
switch (argv[i][1]) { |
414 |
case 'i': /* input */ |
415 |
if (argv[i][2] == 'c') /* columns */ |
416 |
ni_columns = atoi(argv[++i]); |
417 |
else if (argv[i][2] == 'r') |
418 |
ni_rows = atoi(argv[++i]); |
419 |
else |
420 |
goto userr; |
421 |
break; |
422 |
case 'o': /* output */ |
423 |
if (argv[i][2] == 'c') /* columns */ |
424 |
no_columns = atoi(argv[++i]); |
425 |
else if (argv[i][2] == 'r') |
426 |
no_rows = atoi(argv[++i]); |
427 |
else |
428 |
goto userr; |
429 |
break; |
430 |
case 'h': /* header on/off */ |
431 |
do_header = !do_header; |
432 |
break; |
433 |
case 't': /* transpose on/off */ |
434 |
transpose = !transpose; |
435 |
break; |
436 |
case 'f': /* format */ |
437 |
switch (argv[i][2]) { |
438 |
case 'a': /* ASCII */ |
439 |
case 'A': |
440 |
fmtid = "ascii"; |
441 |
record_width = 1; |
442 |
break; |
443 |
case 'f': /* float */ |
444 |
case 'F': |
445 |
fmtid = "float"; |
446 |
record_width = -(int)sizeof(float); |
447 |
break; |
448 |
case 'd': /* double */ |
449 |
case 'D': |
450 |
fmtid = "double"; |
451 |
record_width = -(int)sizeof(double); |
452 |
break; |
453 |
case 'b': /* binary (bytes) */ |
454 |
case 'B': |
455 |
fmtid = "byte"; |
456 |
record_width = -1; |
457 |
break; |
458 |
default: |
459 |
goto userr; |
460 |
} |
461 |
if (argv[i][3]) { |
462 |
if (!isdigit(argv[i][3])) |
463 |
goto userr; |
464 |
record_width *= atoi(argv[i]+3); |
465 |
} |
466 |
break; |
467 |
default: |
468 |
goto userr; |
469 |
} |
470 |
if (!record_width) |
471 |
goto userr; |
472 |
if (i < argc-1) /* arg count OK? */ |
473 |
goto userr; |
474 |
/* open input file? */ |
475 |
if (i == argc-1 && freopen(argv[i], "r", stdin) == NULL) { |
476 |
fprintf(stderr, "%s: cannot open for reading\n", argv[i]); |
477 |
return(1); |
478 |
} |
479 |
if (record_width < 0) { |
480 |
SET_FILE_BINARY(stdin); |
481 |
SET_FILE_BINARY(stdout); |
482 |
} |
483 |
/* check for no-op */ |
484 |
if (!transpose && (record_width < 0 || |
485 |
(no_columns == ni_columns) & (no_rows == ni_rows))) { |
486 |
fprintf(stderr, "%s: no-op -- copying input verbatim\n", |
487 |
argv[0]); |
488 |
if (!output_stream(stdin)) |
489 |
return(1); |
490 |
return(0); |
491 |
} |
492 |
if (do_header) { /* read/write header */ |
493 |
if (getheader(stdin, &headline, NULL) < 0) |
494 |
return(1); |
495 |
printargs(argc, argv, stdout); |
496 |
fputformat(fmtid, stdout); |
497 |
fputc('\n', stdout); /* finish new header */ |
498 |
} |
499 |
if (transpose) { /* transposing rows & columns? */ |
500 |
MEMLOAD myMem; /* need to load into memory */ |
501 |
if (i == argc-1) { |
502 |
if (load_file(&myMem, stdin) <= 0) { |
503 |
fprintf(stderr, "%s: error loading file into memory\n", |
504 |
argv[i]); |
505 |
return(1); |
506 |
} |
507 |
} else if (load_stream(&myMem, stdin) <= 0) { |
508 |
fprintf(stderr, "%s: error loading stdin into memory\n", |
509 |
argv[0]); |
510 |
return(1); |
511 |
} |
512 |
if (!do_transpose(&myMem)) |
513 |
return(1); |
514 |
/* free_load(&myMem); */ |
515 |
} else if (!do_resize(stdin)) /* just reshaping input */ |
516 |
return(1); |
517 |
return(0); |
518 |
userr: |
519 |
fprintf(stderr, |
520 |
"Usage: %s [-h][-f[afdb][N]][-t][-ic in_col][-ir in_row][-oc out_col][-or out_row] [input.dat]\n", |
521 |
argv[0]); |
522 |
return(1); |
523 |
} |