VB.net 2010 视频教程 VB.net 2010 视频教程 python基础视频教程
SQL Server 2008 视频教程 c#入门经典教程 Visual Basic从入门到精通视频教程
当前位置:
首页 > Python基础教程 >
  • C#教程之如何检测或判断一个文件或字节流(无(3)

// NOTE(review): the statements down to the first method-closing brace are the tail of a method
// whose opening lines precede this excerpt; they are kept token-for-token and only re-indented.
                if (le16 > 0 && be16 > 0)
                {
                    return Encoding.None;
                }
            }

            if (le16 > 0)
            {
                if (le16 == le32 && buffer.Length % 4 == 0)
                {
                    return Encoding.Utf32NoBom;
                }

                return Encoding.UnicodeNoBom;
            }
            else if (be16 > 0)
            {
                return Encoding.BigEndianUnicodeNoBom;
            }
            else if (buffer.Length % 4 == 0 && zeroCount >= buffer.Length / 4)
            {
                return Encoding.Utf32NoBom;
            }

            return Encoding.None;
        }

        /// <summary>
        /// Checks whether the first <paramref name="size"/> bytes of a buffer contain a zero byte.
        /// Used to tell binary data from text data.
        /// </summary>
        /// <param name="buffer">The byte buffer.</param>
        /// <param name="size">The number of bytes to inspect, counted from the start of the buffer.</param>
        /// <returns><c>true</c> if a zero byte was found; otherwise <c>false</c>.</returns>
        private static bool ContainsZero(byte[] buffer, int size)
        {
            for (int i = 0; i < size; i++)
            {
                if (buffer[i] == 0)
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Guesses whether a buffer holds BOM-less UTF-16 text from where its zero bytes fall.
        /// In ASCII/script-like text encoded as UTF-16, every other byte is zero, and whether the
        /// zeros sit at odd or even offsets hints at the byte order. The prediction is based on
        /// the proportion of zero bytes at each parity.
        /// </summary>
        /// <param name="buffer">The byte buffer.</param>
        /// <param name="size">The number of bytes to inspect, counted from the start of the buffer.</param>
        /// <returns>
        /// <c>Encoding.UnicodeNoBom</c> when zeros appear at odd offsets and the even-offset zero ratio is below 0.1;
        /// <c>Encoding.BigEndianUnicodeNoBom</c> when zeros appear at even offsets and the odd-offset zero ratio is below 0.1;
        /// otherwise <c>Encoding.None</c>.
        /// </returns>
        private Encoding CheckByZeroNumPercent(byte[] buffer, int size)
        {
            if (size <= 0)
            {
                // Nothing to sample. Without this guard the ratios below would be 0/0 (NaN),
                // which also ends in Encoding.None; the guard just makes that explicit.
                return Encoding.None;
            }

            int evenZeroCount = 0;
            int oddZeroCount = 0;

            for (int i = 0; i < size; i++)
            {
                if (buffer[i] != 0)
                {
                    continue;
                }

                if ((i & 1) == 0)
                {
                    evenZeroCount++;
                }
                else
                {
                    oddZeroCount++;
                }
            }

            // Each parity covers about half of the inspected bytes, hence the factor of two.
            double evenZeroPercent = evenZeroCount * 2.0 / size;
            double oddZeroPercent = oddZeroCount * 2.0 / size;

            // Zeros at odd offsets, few zeros at even offsets.
            // (Author's note: this condition was modified - any odd-offset zero is enough.)
            if (evenZeroPercent < 0.1 && oddZeroPercent > 0)
            {
                return Encoding.UnicodeNoBom;
            }

            // Zeros at even offsets, few zeros at odd offsets.
            // (Author's note: this condition was modified as well.)
            if (oddZeroPercent < 0.1 && evenZeroPercent > 0)
            {
                return Encoding.BigEndianUnicodeNoBom;
            }

            // Don't know.
            return Encoding.None;
        }

        /// <summary>
        /// Checks whether a buffer contains valid UTF-8 by validating the byte ranges of
        /// lead bytes and continuation bytes.
        /// </summary>
        /// <param name="buffer">The byte buffer.</param>
        /// <param name="size">The number of bytes to inspect, counted from the start of the buffer.</param>
        /// <returns>
        /// <c>Encoding.None</c> when a zero byte, an invalid lead byte or an out-of-range continuation byte is found;
        /// <c>Encoding.Ascii</c> when every inspected byte is in the range 1-127;
        /// <c>Encoding.Utf8Nobom</c> when at least one multi-byte lead byte was seen and nothing invalid was found.
        /// </returns>
        private Encoding CheckUtf8(byte[] buffer, int size)
        {
            // UTF-8 valid sequences
            // 0xxxxxxx                             ASCII
            // 110xxxxx 10xxxxxx                    2-byte
            // 1110xxxx 10xxxxxx 10xxxxxx           3-byte
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  4-byte
            //
            // Width in UTF-8
            // Lead byte (decimal)   Width
            // 0-127                 1 byte
            // 194-223               2 bytes
            // 224-239               3 bytes
            // 240-244               4 bytes
            //
            // Subsequent bytes are in the range 128-191.
            bool onlySawAsciiRange = true;
            int pos = 0;

            while (pos < size)
            {
                byte ch = buffer[pos++];

                if (ch == 0)
                {
                    return Encoding.None;
                }

                int moreChars;
                if (ch <= 127)
                {
                    moreChars = 0; // 1 byte
                }
                else if (ch >= 194 && ch <= 223)
                {
                    moreChars = 1; // 2 bytes
                }
                else if (ch >= 224 && ch <= 239)
                {
                    moreChars = 2; // 3 bytes
                }
                else if (ch >= 240 && ch <= 244)
                {
                    moreChars = 3; // 4 bytes
                }
                else
                {
                    return Encoding.None; // Not UTF-8
                }

                if (moreChars > 0)
                {
                    // The lead byte itself is non-ASCII. Record that here rather than inside the
                    // continuation loop, so a lead byte that is the last inspected byte is not
                    // misreported as plain ASCII.
                    onlySawAsciiRange = false;
                }

                // Check the continuation bytes that are available. A sequence cut off by the end
                // of the inspected range is tolerated, not rejected.
                while (moreChars > 0 && pos < size)
                {
                    ch = buffer[pos++];
                    if (ch < 128 || ch > 191)
                    {
                        return Encoding.None; // Not UTF-8
                    }

                    --moreChars;
                }
            }

            // Only valid UTF-8 sequences were processed. If only bytes in the range 0-127 were
            // seen, UTF-8 cannot be assumed; the caller has to decide.
            return onlySawAsciiRange ? Encoding.Ascii : Encoding.Utf8Nobom;
        }

        /// <summary>
        /// Determines whether the buffer looks like a Chinese double-byte encoding
        /// (GB2312, GBK or Big5) and stores the result in <c>IsChinese</c>.
        /// <c>IsChinese</c> is set to <c>true</c> only if every consecutive byte pair is a valid
        /// lead/trail pair in one of those encodings; a trailing unpaired byte is ignored.
        /// </summary>
        /// <param name="buffer">The byte buffer.</param>
        /// <param name="size">The number of bytes to inspect, counted from the start of the buffer.</param>
        private void CheckChinese(byte[] buffer, int size)
        {
            IsChinese = false;

            if (size < 2)
            {
                return;
            }

            // Walk the buffer two bytes at a time; "pos + 1 < size" keeps the pair in bounds.
            for (int pos = 0; pos + 1 < size; pos += 2)
            {
                byte lead = buffer[pos];
                byte trail = buffer[pos + 1];

                // GB2312: lead 0xB0-0xF7, trail 0xA0-0xFE.
                bool isGb2312 = lead >= 0xB0 && lead <= 0xF7
                    && trail >= 0xA0 && trail <= 0xFE;

                // GBK: lead 0x81-0xFE, trail 0x40-0xFE except 0x7F.
                bool isGbk = lead >= 0x81 && lead <= 0xFE
                    && trail >= 0x40 && trail <= 0xFE && trail != 0x7F;

                // Big5: lead 0x81-0xFE, trail 0x40-0x7E or 0xA1-0xFE.
                // The lead byte is capped at 0xFE; 0xFF is not a valid lead byte.
                bool isBig5 = lead >= 0x81 && lead <= 0xFE
                    && ((trail >= 0x40 && trail <= 0x7E) || (trail >= 0xA1 && trail <= 0xFE));

                if (!(isGb2312 || isGbk || isBig5))
                {
                    return;
                }
            }

            IsChinese = true;
        }
    }
}
复制代码

后续更新地址:https://github.com/cyq1162/cyqdata/blob/master/Tool/IOHelper.cs

总结:

1、考虑到UTF7已经过时了,所以直接无视了。

2、对于纯中文情况,UTF16下是BE还是LE,暂时没有想到好的检测方法,所以默认返回了常用的LE,即Unicode。

3、其它一切都安好,全国公开的C#版本,应该就此一份了。


相关教程
关于我们--广告服务--免责声明--本站帮助-友情链接--版权声明--联系我们       黑ICP备07002182号