#!/usr/bin/python
# coding:utf8
import os
import chardet
import sys
import traceback
import logging
# Recursively collect the paths of all files under a directory.
def get_all_file_path(path, all_file_path=None):
    """
    Collect the path of every non-directory entry below ``path``.

    Directories are descended into recursively, in the order returned by
    ``os.listdir``.

    :param path: directory to scan
    :param all_file_path: list that discovered paths are appended to (it is
        mutated in place); a new list is created when omitted
    :return: ``all_file_path`` with the discovered paths appended, or a new
        empty list when ``path`` is not an existing directory
    """
    if all_file_path is None:
        all_file_path = []
    if not os.path.isdir(path):
        # A parenthesised single-argument print behaves the same on
        # Python 2 and Python 3.
        print("%s该文件路径不存在" % (path))
        return []
    for filename in os.listdir(path):
        filepath = os.path.join(path, filename)
        # Recurse into sub-directories; anything else is recorded as a file.
        if os.path.isdir(filepath):
            get_all_file_path(filepath, all_file_path)
        else:
            all_file_path.append(filepath)
    return all_file_path
# Convert a file to another encoding in a single pass.
def imp_file_encode(file_path, final_file_name, target_code):
    """
    Re-encode the file at ``file_path`` and write it to ``final_file_name``.

    The whole source file is read into memory, its encoding is guessed with
    ``chardet.detect``, and the content is decoded and then encoded with
    ``target_code``.

    :param file_path: path of the file to convert
    :param final_file_name: path the converted content is written to
    :param target_code: name of the target encoding
    :return: True on success; False when any step raised (the traceback is
        printed)
    """
    try:
        # Read raw bytes: text mode would alter the data on some platforms
        # and chardet needs the undecoded content.
        with open(file_path, 'rb') as file_obj:
            file_content = file_obj.read()
        # Guess the source encoding from the content.
        source_code = chardet.detect(file_content)['encoding']
        if source_code is None:
            # chardet could not decide (e.g. empty input); fall back to UTF-8.
            source_code = 'utf-8'
        # The bytes must be decoded before they can be re-encoded.
        target_file_content = file_content.decode(source_code).encode(target_code)
        with open(final_file_name, 'wb') as fp:
            fp.write(target_file_content)
        return True
    except Exception:
        traceback.print_exc()
        return False
# Improved conversion function: converts in chunks so large files work too.
def _detect_file_encoding(file_path, chunk_size):
    """Guess the encoding of ``file_path`` by feeding it to chardet in chunks; 'utf-8' if unknown."""
    from chardet.universaldetector import UniversalDetector

    detector = UniversalDetector()
    with open(file_path, 'rb') as file_obj:
        # Stop as soon as the detector is confident or the file ends.
        while not detector.done:
            chunk = file_obj.read(chunk_size)
            if not chunk:
                break
            detector.feed(chunk)
    detector.close()
    return detector.result['encoding'] or 'utf-8'


def file_encode(file_path, final_file_name, target_code):
    """
    Re-encode the file at ``file_path`` chunk by chunk into ``final_file_name``.

    The source encoding is detected once for the whole file. The content is
    then streamed through incremental codecs, so a multibyte character that
    straddles a chunk boundary is converted intact. Bytes or characters that
    cannot be converted are dropped ('ignore' error handling). An existing
    output file is overwritten, and a missing output directory is created.

    :param file_path: path of the file to convert
    :param final_file_name: path the converted content is written to; when
        given as a byte string it is itself re-encoded to ``target_code``
    :param target_code: name of the target encoding
    :return: True on success; False when any step raised (the traceback is
        printed)
    """
    import codecs

    # Read roughly 1 MB at a time so large files do not exhaust memory.
    chunk_size = 1000000
    try:
        # Read the source first so an unreadable input creates no output.
        source_code = _detect_file_encoding(file_path, chunk_size)

        # Re-encode the output path only when it is a byte string; text
        # paths are passed to the OS unchanged.
        if isinstance(final_file_name, bytes):
            name_code = chardet.detect(final_file_name)['encoding']
            if name_code is None:
                name_code = 'utf-8'
            final_file_name = final_file_name.decode(name_code, 'ignore').encode(target_code, 'ignore')

        # dirname is '' for a bare file name, in which case nothing needs
        # to be created.
        target_dir = os.path.dirname(final_file_name)
        if target_dir and not os.path.isdir(target_dir):
            try:
                os.makedirs(target_dir)
            except Exception as e:
                print("Can not create dir: %s" % e)

        # Incremental codecs keep state between chunks (partial multibyte
        # sequences, byte-order marks).
        decoder = codecs.getincrementaldecoder(source_code)('ignore')
        encoder = codecs.getincrementalencoder(target_code)('ignore')
        with open(file_path, 'rb') as file_obj:
            with open(final_file_name, 'wb') as target_file_obj:
                sequence = 0
                while True:
                    print("正在转化.......... %d" % sequence)
                    sequence += 1
                    file_content = file_obj.read(chunk_size)
                    # An empty read means end of file.
                    if not file_content:
                        break
                    unicode_file_content = decoder.decode(file_content)
                    target_file_obj.write(encoder.encode(unicode_file_content))
                # Flush whatever the codecs are still buffering.
                remaining = decoder.decode(b'', final=True)
                target_file_obj.write(encoder.encode(remaining, final=True))
        return True
    except Exception:
        traceback.print_exc()
        return False
# Example: convert every file under the directory given on the command line to GBK.
if __name__ == '__main__':
    # Validate the arguments first so a usage error creates nothing on disk.
    if len(sys.argv) != 2:
        logging.error("Please input file path!")
        sys.exit(1)

    output_dir = 'gbk_file_data'
    if not os.path.isdir(output_dir):
        try:
            os.mkdir(output_dir)
        except Exception as e:
            print("Can not create dir: %s" % e)

    all_file_path = get_all_file_path(sys.argv[1], [])
    for file_path in all_file_path:
        # Outputs are flat: files with the same base name in different
        # sub-directories map to the same target path.
        target_path = os.path.join(output_dir, 'gbk_' + os.path.basename(file_path))
        if file_encode(file_path, target_path, "gbk"):
            print("%s--转码成功" % (file_path))
        else:
            print("%s--转码失败" % (file_path))