Python 遍历指定文件目录的所有文件并对其进行转码

#!/usr/bin/python
# coding:utf8

import codecs
import logging
import os
import sys
import traceback

import chardet
from chardet.universaldetector import UniversalDetector


# Walk a directory tree and list its files
def get_all_file_path(path, all_file_path=None):
    """
    Recursively collect the paths of all non-directory entries under ``path``.

    :param path: directory to scan
    :param all_file_path: list that discovered paths are appended to; a new
        list is created when it is omitted
    :return: ``all_file_path`` with the discovered paths appended, or an
        empty list when ``path`` is not an existing directory
    """
    if all_file_path is None:
        all_file_path = []
    if not os.path.isdir(path):
        # Parenthesized single-argument form works on both Python 2 and 3.
        print("%s该文件路径不存在" % (path,))
        return []
    for filename in os.listdir(path):
        filepath = os.path.join(path, filename)
        # Recurse into sub-directories; anything else is recorded as a file.
        if os.path.isdir(filepath):
            get_all_file_path(filepath, all_file_path)
        else:
            all_file_path.append(filepath)
    return all_file_path


# Transcode a file (reads the whole file into memory at once)
def imp_file_encode(file_path, final_file_name, target_code):
    """
    Convert one file to ``target_code``, reading it fully into memory.

    :param file_path: path of the file to convert
    :param final_file_name: path the converted content is written to
    :param target_code: name of the target encoding, e.g. "gbk"
    :return: True on success; False if any step raised (the traceback is
        printed)
    """
    try:
        # Binary mode: chardet and decode() need the raw bytes. The ``with``
        # block closes the handle even when reading fails.
        with open(file_path, 'rb') as file_obj:
            file_content = file_obj.read()
        # Guess the source encoding from the raw bytes.
        source_code = chardet.detect(file_content)['encoding']
        # chardet reports None when it cannot decide (e.g. an empty file);
        # fall back to UTF-8, the same default file_encode uses.
        if source_code is None:
            source_code = 'utf-8'
        # Bytes must be decoded to text before they can be re-encoded.
        target_file_content = file_content.decode(source_code).encode(target_code)
        with open(final_file_name, 'wb') as fp:
            fp.write(target_file_content)
        return True
    except Exception:
        traceback.print_exc()
        return False

# Improved transcoder: streams the file in chunks so large inputs convert
# correctly without being loaded into memory at once.
def _detect_file_encoding(file_path, chunk_size):
    """Return chardet's encoding guess for the file, or 'utf-8' if it has none."""
    detector = UniversalDetector()
    with open(file_path, 'rb') as file_obj:
        for chunk in iter(lambda: file_obj.read(chunk_size), b''):
            detector.feed(chunk)
            # The detector flags ``done`` once it is confident; stop early.
            if detector.done:
                break
    detector.close()
    return detector.result['encoding'] or 'utf-8'


def _transcode_path_name(path_name, target_code):
    """Re-encode a byte-string path into ``target_code``; text paths pass through."""
    if not isinstance(path_name, bytes):
        return path_name
    name_code = chardet.detect(path_name)['encoding'] or 'utf-8'
    return path_name.decode(name_code, 'ignore').encode(target_code, 'ignore')


def file_encode(file_path, final_file_name, target_code):
    """
    Convert one file to ``target_code``, streaming it in ~1 MB chunks.

    The source encoding is detected once for the whole file, then the content
    is converted with incremental codecs so that a multi-byte character that
    straddles a chunk boundary is decoded intact. Undecodable or unencodable
    characters are dropped ('ignore' error handling).

    :param file_path: path of the file to convert
    :param final_file_name: path the converted content is written to; must
        differ from ``file_path`` because the target is truncated on open.
        A byte-string path is itself re-encoded into ``target_code``.
    :param target_code: name of the target encoding, e.g. "gbk"
    :return: True on success; False if any step raised (the traceback is
        printed)
    """
    chunk_size = 1000000
    try:
        final_file_name = _transcode_path_name(final_file_name, target_code)
        # dirname strips only the last path component; it is empty when the
        # target has no directory part, in which case there is nothing to create.
        target_dir = os.path.dirname(final_file_name)
        print(target_dir)
        if target_dir and not os.path.isdir(target_dir):
            try:
                os.makedirs(target_dir)
            except Exception as e:
                print("Can not create dir: %s" % (e,))
        source_code = _detect_file_encoding(file_path, chunk_size)
        decoder = codecs.getincrementaldecoder(source_code)('ignore')
        encoder = codecs.getincrementalencoder(target_code)('ignore')
        with open(file_path, 'rb') as file_obj:
            # 'wb' (not append) so re-running does not duplicate the output.
            with open(final_file_name, 'wb') as target_file_obj:
                sequence = 0
                while True:
                    # Read about 1 MB at a time to keep memory use bounded.
                    print("正在转化.......... %d" % sequence)
                    sequence += 1
                    file_content = file_obj.read(chunk_size)
                    # An empty read means end of file.
                    if not file_content:
                        break
                    unicode_file_content = decoder.decode(file_content)
                    target_file_obj.write(encoder.encode(unicode_file_content))
                # Flush whatever state the incremental codecs still hold.
                tail = decoder.decode(b'', final=True)
                target_file_obj.write(encoder.encode(tail, final=True))
        return True
    except Exception:
        traceback.print_exc()
        return False

# Example: convert every file under the given directory to GBK
if __name__ == '__main__':
    # Validate the command line before touching the file system.
    if len(sys.argv) != 2:
        logging.error("Please input file path!")
        # sys.exit rather than the interactive-only ``exit`` helper.
        sys.exit(1)
    if not os.path.isdir('gbk_file_data'):
        try:
            os.mkdir('gbk_file_data')
        except Exception as e:
            print("Can not create dir: %s" % (e,))
    all_file_path = get_all_file_path(sys.argv[1], [])
    for file_path in all_file_path:
        # basename understands the platform's path separators, unlike
        # splitting on '/' by hand.
        target_name = 'gbk_file_data/' + 'gbk_' + os.path.basename(file_path)
        if file_encode(file_path, target_name, "gbk"):
            print("%s--转码成功" % (file_path,))
        else:
            print("%s--转码失败" % (file_path,))
Python 遍历指定文件目录的所有文件并对其进行转码

相关推荐