有一个 txt 格式的字典源文件,数据排列格式如下:
apple[TAB]苹果
map[TAB]地图
使用自己写的 C++ 程序把它转换成 ["apple"]="苹果",["map"]="地图", 这样的格式。
选择 UTF-8 编码后,提取成功的文件里总有一些内容是乱码……
有谁可以指点下么?
下面是我的 C++ 程序:
代码: 全选
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <map>
#include <string>
using namespace std;
// Folds a single char to a normalized lowercase form.
//
// The switch maps groups of specific characters to the plain ASCII letters
// e, s, c, r, z, y, a, i, e, u, t, d, n and o, maps a double quote to a single
// quote, and hands every other character to tolower().
//
// NOTE(review): in this copy of the file every case label for the letter
// groups shows up as a question-mark placeholder with no closing quote. The
// original character constants were evidently lost in an encoding conversion,
// so the switch will not compile as shown (and identical labels would be
// duplicates). The real characters must be restored from the original source.
// Presumably they are accented single-byte Latin letters; confirm before use.
//
// NOTE(review): the function works on one char at a time, so it has no notion
// of multi-byte encodings. Feeding it the bytes of UTF-8 text would rewrite
// individual bytes of a multi-byte character; verify what callers pass in.
//
// NOTE(review): tolower() is called with a plain char. Where char is signed,
// a negative value other than EOF is outside the range tolower() accepts;
// converting through unsigned char first is the usual remedy. TODO confirm.
//
// @param ch  character to fold
// @return    the folded character
char ToLower(char ch)
{
switch(ch)
{
case '?:
case '?:
return 'e';
case '?:
case '?:
case '?:
return 's';
case '?:
case '?:
return 'c';
case '?:
case '?:
return 'r';
case '?:
case '?:
case '?:
return 'z';
case '?:
case '?:
return 'y';
case '?:
case '?:
return 'a';
case '?:
case '?:
return 'i';
case '?:
case '?:
return 'e';
case '?:
case '?:
case '?:
return 'u';
case '?:
case '?:
return 't';
case '?:
case '?:
return 'd';
case '?:
case '?:
return 'n';
case '?:
case '?:
return 'o';
case '\"':
return '\'';
default:
return tolower(ch);
}
}
// Counts the space-separated pieces of a NUL-terminated string.
//
// Every run of one or more consecutive spaces is treated as a single
// separator, and the result is the number of such runs plus one. As a
// consequence an empty string yields 1, and a leading or trailing run of
// spaces still counts as a separator.
//
// @param p  NUL-terminated string to inspect (must not be null)
// @return   number of space runs in the string, plus one
int GetWordCount(const char *p)
{
    int gaps = 0;
    const char *cursor = strchr(p, ' ');
    while (cursor)
    {
        ++gaps;
        // cursor sits on a space here; step past the whole run of spaces
        do
        {
            ++cursor;
        } while (*cursor == ' ');
        cursor = strchr(cursor, ' ');
    }
    return gaps + 1;
}
// Applies ToLower() to the 7-bit ASCII bytes of a NUL-terminated string, in place.
// Bytes >= 0x80 are deliberately left untouched: in UTF-8 they are parts of
// multi-byte characters, and rewriting them one byte at a time corrupts the text.
static void FoldAsciiInPlace(char *s)
{
    for (; *s; ++s)
    {
        if (static_cast<unsigned char>(*s) < 0x80)
            *s = ToLower(*s);
    }
}

// Converts the tab-separated dictionary file "ac.dat" into a Lua table literal
// written to standard output.
//
// Each input line has the form  key<TAB>translation . Keys are folded with
// ToLower(), a leading "a " is dropped, and keys that still consist of more
// than one word (per GetWordCount) are skipped. Translations that share a key
// are joined with "; ". The result is printed as
//   dictionary={["key"]="translation",...}
//
// The input is treated as UTF-8: only ASCII bytes are folded, and a UTF-8
// byte-order mark at the start of the file is skipped.
//
// NOTE: lines are read with a 1024-byte buffer; a longer line is delivered by
// fgets() in several pieces, and the later pieces are handled as if they were
// separate lines.
//
// @return EXIT_SUCCESS on success, 1 if "ac.dat" cannot be opened
int main(int argc, char *argv[])
{
    FILE *f = fopen("ac.dat", "rt");
    if (!f)
    {
        // stdout carries the generated Lua code, so report errors on stderr
        fprintf(stderr, "can't open ac.dat\n");
        return 1;
    }

    char buffer[1024];

    // build a dictionary
    typedef std::map<std::string, std::string> dictionary_t;
    dictionary_t dictionary;
    bool first_line = true;
    while (fgets(buffer, sizeof(buffer), f))
    {
        char *buf = buffer;
        if (first_line)
        {
            first_line = false;
            // skip a UTF-8 byte-order mark so it does not become part of the first key
            if (strncmp(buf, "\xEF\xBB\xBF", 3) == 0)
                buf += 3;
        }

        char *p = strchr(buf, '\t');
        if (!p)
            continue; // no separator: not a dictionary entry
        *p = 0;       // terminate the key
        ++p;          // p now points at the translation

        FoldAsciiInPlace(buf);
        if (*buf == 'a' && *(buf + 1) == ' ')
            buf += 2; // drop a leading "a "
        if (GetWordCount(buf) > 1)
            continue; // keep single-word keys only

        // strip the line terminator, including the '\r' of CRLF files
        size_t len = strlen(p);
        while (len > 0 && (p[len - 1] == '\n' || p[len - 1] == '\r'))
            p[--len] = 0;

        // ASCII-only folding keeps the double-quote -> single-quote replacement
        // (needed because the value is emitted inside "...") without touching
        // the bytes of multi-byte UTF-8 characters
        FoldAsciiInPlace(p);

        std::string &value = dictionary[buf];
        if (value.size())
            value += "; ";
        value += p;
    }
    fclose(f);

    // output lua code
    printf("dictionary={");
    for (dictionary_t::const_iterator it = dictionary.begin(); it != dictionary.end(); ++it)
    {
        printf("[\"%s\"]=\"%s\",", it->first.c_str(), it->second.c_str());
    }
    printf("}\n");
    return EXIT_SUCCESS;
}