#include <stdio.h>
#include <string.h>

// Byte type used for raw UTF-8 data throughout this file.
// NOTE(review): <stdint.h> provides a type with the same name; confirm that no
// header pulled into this build defines it incompatibly.
typedef unsigned char uint8_t;

// Reports whether the buffer begins with the UTF-8 byte order mark (EF BB BF).
//
// src    - bytes to inspect; a NULL pointer yields false.
// length - number of readable bytes in src; fewer than 3 yields false.
//
// Returns true only when the first three bytes are exactly EF BB BF.
bool HasUtf8Bom(const uint8_t* src, const size_t length) {
 if (src == NULL) {
   return false;
 }
 if (length < 3) {
   // Too short to hold the three-byte mark.
   return false;
 }

 return (src[0] == 0xEF) && (src[1] == 0xBB) && (src[2] == 0xBF);
}

// Counts the well-formed UTF-8 characters at the start of a byte buffer.
//
// A leading UTF-8 BOM (EF BB BF) is skipped and is not counted as a character.
// Scanning stops at whichever comes first:
//   - the end of the buffer,
//   - a NUL byte,
//   - an ill-formed sequence (bad lead byte, bad continuation byte, overlong
//     form, surrogate, or a value above U+10FFFF),
//   - a multi-byte character cut off by the end of the buffer.
//
// src        - bytes to scan.
// byteLength - in: number of readable bytes in src.
//              out: number of bytes consumed (including the BOM if present),
//              i.e. the offset at which scanning stopped. Left untouched when
//              -1 is returned.
//
// Returns the number of characters counted, or -1 if src or byteLength is NULL.
int CountUtf8Characters(const uint8_t* src, size_t* byteLength) {
  if ((src == NULL) || (byteLength == NULL)) {
    return -1;
  }

  const size_t total = *byteLength;
  size_t i = HasUtf8Bom(src, total) ? 3 : 0;
  int count = 0;

  while (i < total) {
    const uint8_t firstByte = src[i];
    if (firstByte == 0) {
      break; // NUL terminates the text
    }

    size_t charLength = 0;
    // Allowed range of the second byte. It is narrower than 80..BF for a few
    // lead bytes so that overlong encodings, UTF-16 surrogates and code points
    // above U+10FFFF are rejected.
    uint8_t secondMin = 0x80;
    uint8_t secondMax = 0xBF;

    if (firstByte <= 0x7F) {
      charLength = 1; // single byte (ASCII)
    } else if ((0xC2 <= firstByte) && (firstByte <= 0xDF)) {
      charLength = 2; // C0 and C1 would be overlong, so they are not accepted
    } else if ((0xE0 <= firstByte) && (firstByte <= 0xEF)) {
      charLength = 3;
      if (firstByte == 0xE0) {
        secondMin = 0xA0; // below A0 would be an overlong form
      } else if (firstByte == 0xED) {
        secondMax = 0x9F; // above 9F would encode a surrogate
      }
    } else if ((0xF0 <= firstByte) && (firstByte <= 0xF4)) {
      charLength = 4;
      if (firstByte == 0xF0) {
        secondMin = 0x90; // below 90 would be an overlong form
      } else if (firstByte == 0xF4) {
        secondMax = 0x8F; // above 8F would exceed U+10FFFF
      }
    } else {
      break; // stray continuation byte or a lead byte that is never valid
    }

    // The character is cut off by the end of the buffer. Written as a
    // subtraction (i < total holds here) so the comparison cannot overflow.
    if (charLength > (total - i)) {
      break;
    }

    // Every byte after the lead must be a continuation byte in range.
    size_t k = 1;
    for (; k < charLength; k++) {
      const uint8_t next = src[i + k];
      const uint8_t lowest = (k == 1) ? secondMin : 0x80;
      const uint8_t highest = (k == 1) ? secondMax : 0xBF;
      if ((next < lowest) || (next > highest)) {
        break;
      }
    }
    if (k < charLength) {
      break; // ill-formed sequence
    }

    i += charLength;
    count++;
  }

  *byteLength = i;
  return count;
}

// Entry point: reads up to 102400 bytes from the file named by argv[1], counts
// the UTF-8 characters in them, and prints the character count together with
// the number of bytes that were consumed.
//
// Only the first 102400 bytes of the file are examined. After printing, the
// program waits for a character on stdin before exiting.
//
// Returns 0 on success, 1 if no file name was given, the file could not be
// opened, or reading it failed.
int main(int argc, char**argv) {
  if (argc < 2) {
    return 1;
  }

  FILE* fp = fopen(argv[1], "rb");
  if (fp == NULL) {
    return 1;
  }

  const size_t BUFFER_LENGTH = 102400;
  uint8_t* buffer = new uint8_t[BUFFER_LENGTH];
  memset(buffer, 0, BUFFER_LENGTH);

  size_t length = fread(buffer, 1, BUFFER_LENGTH, fp);
  // A short read can mean end-of-file or an error; only ferror tells them apart.
  // Query it before fclose, after which the stream must not be used.
  const int readFailed = (ferror(fp) != 0);
  fclose(fp);

  if (readFailed) {
    delete[] buffer;
    return 1;
  }

  int utf8Count = CountUtf8Characters(buffer, &length);

  // length is a size_t, so it needs %zu; %u expects unsigned int.
  printf("count=%d, seq=%zu\n", utf8Count, length);

  delete[] buffer;

  getchar(); // wait for a character on stdin before exiting
  return 0;
}
