[a784c9]: mcutfsig.c Maximize Restore History

Download this file

mcutfsig.c    119 lines (114 with data), 4.0 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
/*
* mcutfsig.c
*
* $Id$
*
* Copyright (C) 2007, Keith Marshall
*
* This file implements the `mc_utf_signature' function, which is used
* by `gencat', to identify message definition source files which appear
* to exhibit any recognisable standard of Unicode encoding.
*
* Written by Keith Marshall <keithmarshall@users.sourceforge.net>
* Last Revision: 22-May-2007
*
*
* This is free software. It is provided AS IS, in the hope that it may
* be useful, but WITHOUT WARRANTY OF ANY KIND, not even an IMPLIED WARRANTY
* of MERCHANTABILITY, nor of FITNESS FOR ANY PARTICULAR PURPOSE.
*
* Permission is granted to redistribute this software, either "as is" or
* in modified form, under the terms of the GNU General Public License, as
* published by the Free Software Foundation; either version 2, or (at your
* option) any later version.
*
* You should have received a copy of the GNU General Public License
* along with this software; see the file COPYING. If not, write to the
* Free Software Foundation, 51 Franklin St - Fifth Floor, Boston,
* MA 02110-1301, USA.
*
*/
#include <mcutfsig.h>
unsigned short mc_utf_signature( unsigned char *stream )
{
  /* Inspect the first few bytes of the specified data stream, and
   * attempt to identify a potential Unicode encoding signature.
   *
   * The value returned starts out as the apparent size, in bytes, of
   * one encoding unit, (1 when nothing suggests otherwise); to this are
   * added UTF_BIG_ENDIAN or UTF_LITTLE_ENDIAN, when a byte order can be
   * deduced, and UTF_WITH_BYTE_ORDER_MARK, when an explicit BOM is found.
   * NOTE(review): the flag values come from mcutfsig.h; they are presumed
   * to be distinct bits, lying clear of the unit size -- confirm.
   *
   * No length accompanies `stream'; the caller must guarantee that the
   * leading bytes examined here are readable.
   */
  unsigned short signature = 1;
  int leading_nuls = 0;
  int count;

  /* The first character in the input stream must not be NUL,
   * and must be a member of the POSIX Portable Character Set;
   * if it isn't, then it may indicate a Unicode stream.
   */
  if( *stream == 0 )
  {
    /* An initial NUL byte anticipates a big-endian Unicode stream;
     * one such byte implies UTF-16, without a Byte Order Mark, while
     * two such followed by the big-endian form of the BOM, or three
     * without a BOM, indicates UTF-32.
     *
     * Step over only those bytes which really are NUL, so that `stream'
     * is left addressing the first non-NUL byte; stepping beyond it would
     * misalign the Byte Order Mark check which follows.
     */
    for( count = 4; count-- && (*stream == '\0'); ++stream )
      ++leading_nuls;
    signature += leading_nuls + UTF_BIG_ENDIAN;
  }

  if( (leading_nuls == 0)
  &&  (stream[0] == 0xef) && (stream[1] == 0xbb) && (stream[2] == 0xbf)  )
  {
    /* The three byte sequence `EF BB BF' is the BOM used as a signature
     * for a UTF-8 encoded stream; it is not accepted when preceded by any
     * NUL byte.  It is tested separately, and in full, so that any other
     * stream which merely begins with `0xef' may still be considered by
     * the BOM-less little-endian check below.
     */
    signature |= UTF_WITH_BYTE_ORDER_MARK;
  }

  else if( (*stream & 0xfe) == 0xfe )
  {
    /* A byte of `0xfe' or `0xff' looks like it might begin a UTF-16
     * or UTF-32 Byte Order Mark; combine it with its successor, to
     * identify the byte order, if any, which it represents.
     */
    unsigned bom = ((unsigned)(stream[0]) << 8) | stream[1];
    stream += 2;

    if( bom == 0xfeff )
    {
      /* This is the BOM signature for a big-endian Unicode stream;
       * if preceded by two NULs, (already counted), then it is UTF-32,
       * else it is UTF-16.  In either case, adding an additional one
       * to the accumulated size of the encoding unit yields the
       * desired result, since the first byte of the BOM, and
       * any leading NULs, have already been counted.
       */
      signature += UTF_WITH_BYTE_ORDER_MARK + 1;

      /* Leading NULs have already caused the big-endian flag to be
       * included; it must not be added for a second time.
       */
      if( leading_nuls == 0 )
	signature += UTF_BIG_ENDIAN;
    }

    else if( (bom == 0xfffe) && (leading_nuls == 0) )
    {
      /* This is the BOM signature for a little-endian Unicode stream;
       * the first byte has already been included in the initial size
       * assigned for the encoding unit; adjust this to accommodate the
       * second byte, and incorporate the little-endian flag.  (It is
       * ignored after leading NULs, where the stream has already been
       * flagged as big-endian).
       */
      signature += UTF_WITH_BYTE_ORDER_MARK + UTF_LITTLE_ENDIAN + 1;

      /* Each NUL which immediately follows, up to a limit of four,
       * widens the encoding unit by one more byte; thus, `FF FE 00 00'
       * is reported with a unit size of four, for UTF-32LE.
       */
      for( count = 4; count-- && (*stream++ == '\0'); )
	++signature;
    }
  }

  else if( (leading_nuls == 0) && (stream[1] == 0) )
  {
    /* NUL as the second byte in the input stream indicates a probable
     * little-endian Unicode input stream, although this is not indicated
     * by a Byte Order Mark; count the trailing NULs, to determine if we
     * should interpret it as UTF-16LE, or as UTF-32LE.
     */
    ++stream;
    for( count = 4; count-- && (*stream++ == '\0'); )
      ++signature;
    signature += UTF_LITTLE_ENDIAN;
  }
  return signature;
}
/* $RCSfile$Revision$: end of file */