diff --git a/src/expand-common.c b/src/expand-common.c index 4657e46..97cbb09 100644 --- a/src/expand-common.c +++ b/src/expand-common.c @@ -18,6 +18,7 @@ #include #include +#include #include "system.h" #include "die.h" #include "error.h" @@ -85,6 +86,119 @@ add_tab_stop (uintmax_t tabval) } } +extern int +set_utf_locale (void) +{ + /*try using some predefined locale */ + const char* predef_locales[] = {"C.UTF8","en_US.UTF8","en_GB.UTF8"}; + + const int predef_locales_count=3; + for (int i=0;ibufcount=0; + if (c == 0xEF) + { + c=fgetc(fp); + } + else + { + if (c != EOF) + { + ungetc(c,fp); + } + return false; + } + + if (c == 0xBB) + { + c=fgetc(fp); + } + else + { + if ( c!= EOF ) + { + mbf->buf[0]=(unsigned char) 0xEF; + mbf->bufcount=1; + ungetc(c,fp); + return false; + } + else + { + ungetc(0xEF,fp); + return false; + } + } + if (c == 0xBF) + { + mbf->bufcount=0; + return true; + } + else + { + if (c != EOF) + { + mbf->buf[0]=(unsigned char) 0xEF; + mbf->buf[1]=(unsigned char) 0xBB; + mbf->bufcount=2; + ungetc(c,fp); + return false; + } + else + { + mbf->buf[0]=(unsigned char) 0xEF; + mbf->bufcount=1; + ungetc(0xBB,fp); + return false; + } + } + return false; +} + +extern void +print_bom(void) +{ + putc (0xEF, stdout); + putc (0xBB, stdout); + putc (0xBF, stdout); +} + /* Add the comma or blank separated list of tab stops STOPS to the list of tab stops. */ extern void diff --git a/src/expand-common.h b/src/expand-common.h index 8cb2079..763bfda 100644 --- a/src/expand-common.h +++ b/src/expand-common.h @@ -34,6 +34,18 @@ extern size_t max_column_width; /* The desired exit status. */ extern int exit_status; +extern int +set_utf_locale (void); + +extern bool +check_utf_locale(void); + +extern bool +check_bom(FILE* fp, mb_file_t *mbf); + +extern void +print_bom(void); + /* Add tab stop TABVAL to the end of 'tab_list'. 
*/ extern void add_tab_stop (uintmax_t tabval); diff --git a/src/expand.c b/src/expand.c index 310b349..4136824 100644 --- a/src/expand.c +++ b/src/expand.c @@ -105,11 +105,33 @@ expand (void) FILE *fp = next_file (NULL); mb_file_t mbf; mbf_char_t c; - + /* True if the starting locale is utf8. */ + bool using_utf_locale; + + /* True if the first file contains BOM header. */ + bool found_bom; + using_utf_locale=check_utf_locale(); + if (!fp) return; - mbf_init (mbf, fp); + found_bom=check_bom(fp,&mbf); + + if (using_utf_locale == false && found_bom == true) + { + /* BOM found but the starting locale is not utf8: try using some predefined locale. */ + + if (set_utf_locale () != 0) + { + error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale")); + } + } + + + if (found_bom == true) + { + print_bom(); + } while (true) { @@ -134,6 +156,27 @@ expand (void) if ((mb_iseof (c)) && (fp = next_file (fp))) { mbf_init (mbf, fp); + if (fp!=NULL) + { + if (check_bom(fp,&mbf)==true) + { + /* Not the first file - check BOM header. */ + if (using_utf_locale==false && found_bom==false) + { + /* BOM header in subsequent file but not in the first one; no errno applies here, so pass 0. */ + error (EXIT_FAILURE, 0, _("combination of files with and without BOM header")); + } + } + else + { + if(using_utf_locale==false && found_bom==true) + { + /* First file contained BOM header - locale was switched to UTF, + so all subsequent files should contain BOM; no errno applies here, so pass 0. */ + error (EXIT_FAILURE, 0, _("combination of files with and without BOM header")); + } + } + } continue; } else diff --git a/src/unexpand.c b/src/unexpand.c index 863a90a..5681b58 100644 --- a/src/unexpand.c +++ b/src/unexpand.c @@ -116,16 +116,36 @@ unexpand (void) include characters other than spaces, so the blanks must be stored, not merely counted. */ mbf_char_t *pending_blank; + /* True if the starting locale is utf8. */ + bool using_utf_locale; + + /* True if the first file contains BOM header. 
*/ + bool found_bom; + using_utf_locale=check_utf_locale(); if (!fp) return; + mbf_init (mbf, fp); + found_bom=check_bom(fp,&mbf); + if (using_utf_locale == false && found_bom == true) + { + /* BOM found but the starting locale is not utf8: try using some predefined locale. */ + + if (set_utf_locale () != 0) + { + error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale")); + } + } /* The worst case is a non-blank character, then one blank, then a tab stop, then MAX_COLUMN_WIDTH - 1 blanks, then a non-blank; so allocate MAX_COLUMN_WIDTH bytes to store the blanks. */ pending_blank = xmalloc (max_column_width * sizeof (mbf_char_t)); - mbf_init (mbf, fp); + if (found_bom == true) + { + print_bom(); + } while (true) { @@ -169,6 +189,27 @@ unexpand (void) if ((mb_iseof (c)) && (fp = next_file (fp))) { mbf_init (mbf, fp); + if (fp!=NULL) + { + if (check_bom(fp,&mbf)==true) + { + /* Not the first file - check BOM header. */ + if (using_utf_locale==false && found_bom==false) + { + /* BOM header in subsequent file but not in the first one; no errno applies here, so pass 0. */ + error (EXIT_FAILURE, 0, _("combination of files with and without BOM header")); + } + } + else + { + if(using_utf_locale==false && found_bom==true) + { + /* First file contained BOM header - locale was switched to UTF, + so all subsequent files should contain BOM; no errno applies here, so pass 0. */ + error (EXIT_FAILURE, 0, _("combination of files with and without BOM header")); + } + } + } continue; } else diff --git a/tests/expand/mb.sh b/tests/expand/mb.sh index 031be7a..1621c84 100755 --- a/tests/expand/mb.sh +++ b/tests/expand/mb.sh @@ -109,4 +109,75 @@ env printf '12345678 expand < in > out || fail=1 compare exp out > /dev/null 2>&1 || fail=1 + + +#BOM header test 1 +printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . +EOF +env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_ + +printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . 
. . + äöü . öüä. ä xx +EOF + + +expand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +LANG=C expand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +LC_ALL=C expand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + + +printf '\xEF\xBB\xBF' > in1; cat <<\EOF >> in1 || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . +EOF +env printf ' äöü\t. öüä. \tä xx\n' >> in1 || framework_failure_ + + +printf '\xEF\xBB\xBF' > exp; cat <<\EOF >> exp || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + +expand in1 in1 > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +LANG=C expand in1 in1 > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +LC_ALL=C expand in1 in1 > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + exit $fail diff --git a/tests/unexpand/mb.sh b/tests/unexpand/mb.sh index 8d75652..9d4ee3e 100755 --- a/tests/unexpand/mb.sh +++ b/tests/unexpand/mb.sh @@ -111,3 +111,62 @@ env printf '12345678 unexpand -a < in > out || fail=1 compare exp out > /dev/null 2>&1 || fail=1 + +#BOM header test 1 +printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF +env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_ + +printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. 
ä xx +EOF + +unexpand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +LANG=C unexpand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +LC_ALL=C unexpand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + + +printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_ +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + + +unexpand in in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +LANG=C unexpand in in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +LC_ALL=C unexpand in in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1