最近偶然浏览到 https://github.com/unxed/oemcp ,发现这个解决方案很不错:它解释了zip文件名乱码的根本原因,并给出了解决方法。
下面针对Ubuntu 20.10进行操作。乱码原因(引自该项目的说明):Windows stores file names in .zip archives using the so-called OEM code page. That's why you sometimes see wrong characters when trying to open a .zip file. This is a well-known issue plaguing the open source community; see this issue for example: https://github.com/mate-desktop/engrampa/issues/5
代码: 全选
# Refresh the package index and download the source packages into the current directory.
# NOTE: `apt source` and `apt build-dep` need deb-src entries enabled in the APT sources.
sudo apt update
apt source unzip p7zip
# Version fetched when this was written: p7zip 16.02+dfsg-8
# Version fetched when this was written: unzip 6.0-25ubuntu1
# Install the build dependencies of both packages
sudo apt build-dep p7zip unzip
# Patch unzip: enter its unpacked source tree
cd unzip-6.0
cat > debian/patches/25-unzip_oemcpauto_unix.c.patch << 'EOF'
Index: unzip-6.0/unix/unix.c
===================================================================
--- a/unix/unix.c 2020-10-28 15:38:39.000000000 +0800
+++ b/unix/unix.c 2020-10-28 15:48:44.382126431 +0800
@@ -1879,13 +1879,16 @@
#endif /* QLZIP */
+/*
typedef struct {
char *local_charset;
char *archive_charset;
} CHARSET_MAP;
+*/
/* A mapping of local <-> archive charsets used by default to convert filenames
* of DOS/Windows Zip archives. Currently very basic. */
+/*
static CHARSET_MAP dos_charset_map[] = {
{ "ANSI_X3.4-1968", "CP850" },
{ "ISO-8859-1", "CP850" },
@@ -1895,6 +1898,57 @@
{ "KOI8-U", "CP866" },
{ "ISO-8859-5", "CP866" }
};
+*/
+
+/* Map a locale name such as "de_DE.UTF-8" or "sr_RS@latin" to an OEM code page name; never returns NULL. */
+char *lc_to_oem_cp(char *lc) {
+    static char *lc_to_cp_table[] = {
+        "af_ZA", "CP850", "ar_SA", "CP720", "ar_LB", "CP720", "ar_EG", "CP720",
+        "ar_DZ", "CP720", "ar_BH", "CP720", "ar_IQ", "CP720", "ar_JO", "CP720",
+        "ar_KW", "CP720", "ar_LY", "CP720", "ar_MA", "CP720", "ar_OM", "CP720",
+        "ar_QA", "CP720", "ar_SY", "CP720", "ar_TN", "CP720", "ar_AE", "CP720",
+        "ar_YE", "CP720","ast_ES", "CP850", "az_AZ", "CP866", "az_AZ", "CP857",
+        "be_BY", "CP866", "bg_BG", "CP866", "br_FR", "CP850", "ca_ES", "CP850",
+        "zh_CN", "CP936", "zh_TW", "CP950", "kw_GB", "CP850", "cs_CZ", "CP852",
+        "cy_GB", "CP850", "da_DK", "CP850", "de_AT", "CP850", "de_LI", "CP850",
+        "de_LU", "CP850", "de_CH", "CP850", "de_DE", "CP850", "el_GR", "CP737",
+        "en_AU", "CP850", "en_CA", "CP850", "en_GB", "CP850", "en_IE", "CP850",
+        "en_JM", "CP850", "en_BZ", "CP850", "en_PH", "CP437", "en_ZA", "CP437",
+        "en_TT", "CP850", "en_US", "CP437", "en_ZW", "CP437", "en_NZ", "CP850",
+        "es_PA", "CP850", "es_BO", "CP850", "es_CR", "CP850", "es_DO", "CP850",
+        "es_SV", "CP850", "es_EC", "CP850", "es_GT", "CP850", "es_HN", "CP850",
+        "es_NI", "CP850", "es_CL", "CP850", "es_MX", "CP850", "es_ES", "CP850",
+        "es_CO", "CP850", "es_ES", "CP850", "es_PE", "CP850", "es_AR", "CP850",
+        "es_PR", "CP850", "es_VE", "CP850", "es_UY", "CP850", "es_PY", "CP850",
+        "et_EE", "CP775", "eu_ES", "CP850", "fa_IR", "CP720", "fi_FI", "CP850",
+        "fo_FO", "CP850", "fr_FR", "CP850", "fr_BE", "CP850", "fr_CA", "CP850",
+        "fr_LU", "CP850", "fr_MC", "CP850", "fr_CH", "CP850", "ga_IE", "CP437",
+        "gd_GB", "CP850", "gv_IM", "CP850", "gl_ES", "CP850", "he_IL", "CP862",
+        "hr_HR", "CP852", "hu_HU", "CP852", "id_ID", "CP850", "is_IS", "CP850",
+        "it_IT", "CP850", "it_CH", "CP850", "iv_IV", "CP437", "ja_JP", "CP932",
+        "kk_KZ", "CP866", "ko_KR", "CP949", "ky_KG", "CP866", "lt_LT", "CP775",
+        "lv_LV", "CP775", "mk_MK", "CP866", "mn_MN", "CP866", "ms_BN", "CP850",
+        "ms_MY", "CP850", "nl_BE", "CP850", "nl_NL", "CP850", "nl_SR", "CP850",
+        "nn_NO", "CP850", "nb_NO", "CP850", "pl_PL", "CP852", "pt_BR", "CP850",
+        "pt_PT", "CP850", "rm_CH", "CP850", "ro_RO", "CP852", "ru_RU", "CP866",
+        "sk_SK", "CP852", "sl_SI", "CP852", "sq_AL", "CP852", "sr_RS", "CP855",
+        "sr_RS", "CP852", "sv_SE", "CP850", "sv_FI", "CP850", "sw_KE", "CP437",
+        "th_TH", "CP874", "tr_TR", "CP857", "tt_RU", "CP866", "uk_UA", "CP866",
+        "ur_PK", "CP720", "uz_UZ", "CP866", "uz_UZ", "CP857", "vi_VN", "CP1258",
+        "wa_BE", "CP850", "zh_HK", "CP950", "zh_SG", "CP936"};
+    int table_len = sizeof(lc_to_cp_table) / sizeof(char *);
+    int lc_len, i;
+
+    if (lc && lc[0]) {
+        /* Compare only language_TERRITORY: stop at ".codeset" or "@modifier". */
+        lc_len = (int)strcspn(lc, ".@");
+        for (i = 0; lc_len > 0 && i < table_len; i += 2) /* empty prefix would match everything */
+            if (strncmp(lc, lc_to_cp_table[i], lc_len) == 0)
+                return lc_to_cp_table[i + 1]; /* first match wins for duplicated keys */
+    }
+
+    return "CP437"; /* fallback for NULL, empty or unlisted locales */
+}
char OEM_CP[MAX_CP_NAME] = "";
char ISO_CP[MAX_CP_NAME] = "";
@@ -1903,10 +1957,20 @@
* ISO_CP is left alone for now. */
void init_conversion_charsets()
{
+ char *oemcp;
+ oemcp = getenv("OEMCP");
+ if (!oemcp) {
+ oemcp = lc_to_oem_cp(setlocale(LC_CTYPE, ""));
+ }
+ strncpy(OEM_CP, oemcp, strlen(oemcp));
+
+ /*
const char *local_charset;
int i;
+ */
/* Make a guess only if OEM_CP not already set. */
+ /*
if(*OEM_CP == '\0') {
local_charset = nl_langinfo(CODESET);
for(i = 0; i < sizeof(dos_charset_map)/sizeof(CHARSET_MAP); i++)
@@ -1916,6 +1980,7 @@
break;
}
}
+ */
}
/* Convert a string from one encoding to the current locale using iconv().
EOF
# Append the new patch to the series file in debian/patches
echo "25-unzip_oemcpauto_unix.c.patch" >> debian/patches/series
# Repack debian/ into ../unzip_*.debian.tar.xz (the glob must match exactly one existing file)
tar Jcvf ../unzip_*.debian.tar.xz debian/
# Build unzip
dpkg-buildpackage
# Patch p7zip: switch to its unpacked source tree
cd ../p7zip-16.02+dfsg
cat > debian/patches/16-oemcp_ZipItem.cpp.patch << 'EOF'
Index: p7zip-16.02+dfsg/CPP/7zip/Archive/Zip/ZipItem.cpp
===================================================================
--- a/CPP/7zip/Archive/Zip/ZipItem.cpp 2020-10-28 15:38:39.000000000 +0800
+++ b/CPP/7zip/Archive/Zip/ZipItem.cpp 2020-10-28 15:48:44.382126431 +0800
@@ -1,5 +1,10 @@
// Archive/ZipItem.cpp
+#ifndef _WIN32
+#include <iconv.h>
+#include <locale.h>
+#endif
+
#include "StdAfx.h"
#include "../../../../C/CpuArch.h"
@@ -244,6 +249,86 @@
#endif
}
+  #ifndef _WIN32
+  // Names from FAT/NTFS hosts without the UTF-8 flag are treated as OEM
+  // code page text and converted to UTF-8 with iconv. The code page comes
+  // from $OEMCP or, failing that, is guessed from the LC_CTYPE locale.
+  Byte hostOS = GetHostOS();
+  if (!isUtf8 && ((hostOS == NFileHeader::NHostOS::kFAT) || (hostOS == NFileHeader::NHostOS::kNTFS))) {
+
+    const char *oemcp = getenv("OEMCP");
+    if (!oemcp) {
+      oemcp = "CP437"; // fallback when the locale is not in the table
+
+      static const char * const lc_to_cp_table[] = {
+        "af_ZA", "CP850", "ar_SA", "CP720", "ar_LB", "CP720", "ar_EG", "CP720",
+        "ar_DZ", "CP720", "ar_BH", "CP720", "ar_IQ", "CP720", "ar_JO", "CP720",
+        "ar_KW", "CP720", "ar_LY", "CP720", "ar_MA", "CP720", "ar_OM", "CP720",
+        "ar_QA", "CP720", "ar_SY", "CP720", "ar_TN", "CP720", "ar_AE", "CP720",
+        "ar_YE", "CP720","ast_ES", "CP850", "az_AZ", "CP866", "az_AZ", "CP857",
+        "be_BY", "CP866", "bg_BG", "CP866", "br_FR", "CP850", "ca_ES", "CP850",
+        "zh_CN", "CP936", "zh_TW", "CP950", "kw_GB", "CP850", "cs_CZ", "CP852",
+        "cy_GB", "CP850", "da_DK", "CP850", "de_AT", "CP850", "de_LI", "CP850",
+        "de_LU", "CP850", "de_CH", "CP850", "de_DE", "CP850", "el_GR", "CP737",
+        "en_AU", "CP850", "en_CA", "CP850", "en_GB", "CP850", "en_IE", "CP850",
+        "en_JM", "CP850", "en_BZ", "CP850", "en_PH", "CP437", "en_ZA", "CP437",
+        "en_TT", "CP850", "en_US", "CP437", "en_ZW", "CP437", "en_NZ", "CP850",
+        "es_PA", "CP850", "es_BO", "CP850", "es_CR", "CP850", "es_DO", "CP850",
+        "es_SV", "CP850", "es_EC", "CP850", "es_GT", "CP850", "es_HN", "CP850",
+        "es_NI", "CP850", "es_CL", "CP850", "es_MX", "CP850", "es_ES", "CP850",
+        "es_CO", "CP850", "es_ES", "CP850", "es_PE", "CP850", "es_AR", "CP850",
+        "es_PR", "CP850", "es_VE", "CP850", "es_UY", "CP850", "es_PY", "CP850",
+        "et_EE", "CP775", "eu_ES", "CP850", "fa_IR", "CP720", "fi_FI", "CP850",
+        "fo_FO", "CP850", "fr_FR", "CP850", "fr_BE", "CP850", "fr_CA", "CP850",
+        "fr_LU", "CP850", "fr_MC", "CP850", "fr_CH", "CP850", "ga_IE", "CP437",
+        "gd_GB", "CP850", "gv_IM", "CP850", "gl_ES", "CP850", "he_IL", "CP862",
+        "hr_HR", "CP852", "hu_HU", "CP852", "id_ID", "CP850", "is_IS", "CP850",
+        "it_IT", "CP850", "it_CH", "CP850", "iv_IV", "CP437", "ja_JP", "CP932",
+        "kk_KZ", "CP866", "ko_KR", "CP949", "ky_KG", "CP866", "lt_LT", "CP775",
+        "lv_LV", "CP775", "mk_MK", "CP866", "mn_MN", "CP866", "ms_BN", "CP850",
+        "ms_MY", "CP850", "nl_BE", "CP850", "nl_NL", "CP850", "nl_SR", "CP850",
+        "nn_NO", "CP850", "nb_NO", "CP850", "pl_PL", "CP852", "pt_BR", "CP850",
+        "pt_PT", "CP850", "rm_CH", "CP850", "ro_RO", "CP852", "ru_RU", "CP866",
+        "sk_SK", "CP852", "sl_SI", "CP852", "sq_AL", "CP852", "sr_RS", "CP855",
+        "sr_RS", "CP852", "sv_SE", "CP850", "sv_FI", "CP850", "sw_KE", "CP437",
+        "th_TH", "CP874", "tr_TR", "CP857", "tt_RU", "CP866", "uk_UA", "CP866",
+        "ur_PK", "CP720", "uz_UZ", "CP866", "uz_UZ", "CP857", "vi_VN", "CP1258",
+        "wa_BE", "CP850", "zh_HK", "CP950", "zh_SG", "CP936"};
+      const int table_len = sizeof(lc_to_cp_table) / sizeof(lc_to_cp_table[0]);
+      const char *lc = setlocale(LC_CTYPE, "");
+
+      if (lc && lc[0]) {
+        // Compare only language_TERRITORY: stop at ".codeset" or "@modifier"
+        const size_t lc_len = strcspn(lc, ".@");
+        for (int i = 0; lc_len > 0 && i < table_len; i += 2)
+          if (strncmp(lc, lc_to_cp_table[i], lc_len) == 0) {
+            oemcp = lc_to_cp_table[i + 1];
+            break; // first match wins, same as lc_to_oem_cp() in the unzip patch
+          }
+      }
+    }
+
+    iconv_t cd = iconv_open("UTF-8", oemcp);
+    if (cd != (iconv_t)-1) {
+      AString s_utf8;
+      char *src = const_cast<char *>(s.Ptr()); // iconv() wants char**, input is not modified
+      size_t slen = s.Len();
+      size_t dlen = slen * 4; // output budget: 4 bytes per input byte
+      char *buf = s_utf8.GetBuf_SetEnd(dlen + 1); // + null termination
+      char *dest = buf;
+
+      // iconv() returns the number of non-reversible conversions, or (size_t)-1
+      // on error; the bytes written are given by how far dest has advanced.
+      const size_t rc = iconv(cd, &src, &slen, &dest, &dlen);
+      iconv_close(cd);
+      s_utf8.ReleaseBuf_SetEnd((unsigned)(dest - buf)); // real length + terminator (assumes AString API from MyString.h)
+
+      if (rc != (size_t)-1) // on failure fall through to the code after #endif
+        if (ConvertUTF8ToUnicode(s_utf8, res) || ignore_Utf8_Errors)
+          return;
+    }
+  }
+  #endif
if (isUtf8)
if (ConvertUTF8ToUnicode(s, res) || ignore_Utf8_Errors)
EOF
# Append the new patch to the series file in debian/patches
echo "16-oemcp_ZipItem.cpp.patch" >> debian/patches/series
# Repack debian/ into ../p7zip_*.debian.tar.xz (the glob must match exactly one existing file)
tar Jcvf ../p7zip_*.debian.tar.xz debian/
# Build p7zip
dpkg-buildpackage
# Install the freshly built packages from the parent directory
sudo dpkg -i ../unzip_6.0-25ubuntu1_amd64.deb
sudo dpkg -i ../p7zip_16.02+dfsg-8_amd64.deb
sudo dpkg -i ../p7zip-full_16.02+dfsg-8_amd64.deb
下面是已经制作好的补丁文件(下载后请去掉文件名的.txt后缀);仅建议为p7zip打补丁。