Python 多进程图像下载和压缩脚本

图像下载脚本:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Copyright (c) 2018. All rights reserved.
Created by C. L. Wang on 2018/7/9
"""

import argparse
import os
import shutil
from datetime import datetime
from multiprocessing.pool import Pool

import requests


def get_current_time_str():
    """
    Build a compact timestamp string from the current local time.

    :return: 14-digit string in the form YYYYmmddHHMMSS, e.g. 20170718113705
    """
    now = datetime.now()
    # datetime.__format__ delegates to strftime, so this yields the same text.
    return '{:%Y%m%d%H%M%S}'.format(now)


# Name of the log file that print_info appends to; the timestamp is taken
# once, when this module-level statement runs.
# NOTE(review): a worker process that re-imports this module (e.g. under the
# "spawn" start method) would compute its own file name - confirm.
logfile = 'download_log_{}.log'.format(get_current_time_str())  # log file


def download_img(img_url, out_folder, imgs_names, img_name=None, timeout=30):
    """
    Download a single image and store it in ``out_folder``.

    The download is skipped when the target file name is already listed in
    ``imgs_names``. Network and HTTP errors are logged and the function
    returns without writing a file, so an error page is never saved as an
    image.

    :param img_url: URL of the image
    :param out_folder: folder the image is written to
    :param imgs_names: names of the images that already exist
    :param img_name: file name to save as; defaults to the last URL segment
    :param timeout: seconds to wait for the server before giving up
    :return: None
    """
    if not img_name:
        img_name = img_url.split('/')[-1]  # last path segment of the URL

    if not img_name:
        # A URL ending in '/' has no usable file name; joining an empty name
        # would point at the output folder itself.
        print_info('图片下载失败: %s, 无法确定文件名' % img_url)
        return

    if img_name in imgs_names:
        print_info('图片已存在: %s' % img_name)
        return

    try:
        # Without a timeout a stalled server would block this worker forever.
        response = requests.get(img_url, timeout=timeout)
        # Treat 4xx/5xx as failures instead of saving the error body.
        response.raise_for_status()
    except requests.RequestException as e:
        print_info('图片下载失败: %s, %s' % (img_url, e))
        return

    out_file = os.path.join(out_folder, img_name)  # output file

    with open(out_file, 'wb') as hl:
        hl.write(response.content)
    print_info('图片已下载: %s' % img_name)


def download_imgs_for_mp(img_file, out_folder, n_prc=10, prefix=None):
    """
    Download every URL listed in ``img_file`` using a pool of worker processes.

    :param img_file: text file containing one image URL per line
    :param out_folder: folder the images are written to
    :param n_prc: number of worker processes, 10 by default
    :param prefix: optional file-name prefix; when given, images are saved as
        ``<prefix>_<line index>.jpg`` instead of the URL's last segment
    :return: None
    """
    # The command line may hand the count over as a string; Pool needs an int.
    n_prc = int(n_prc)
    print_info('进程总数: %s' % n_prc)

    paths_list = read_file(img_file)
    print_info('文件数: %s' % len(paths_list))

    # A set gives O(1) "already downloaded?" checks in the workers.
    _, imgs_names = traverse_dir_files(out_folder)
    imgs_names = set(imgs_names)

    pool = Pool(processes=n_prc)  # process pool for the downloads
    pending = []
    for (index, path) in enumerate(paths_list):
        if not path:
            continue  # blank line in the URL file, nothing to download
        img_name = prefix + '_' + str(index) + '.jpg' if prefix else None
        result = pool.apply_async(download_img, (path, out_folder, imgs_names, img_name))
        pending.append((path, result))

    pool.close()
    pool.join()

    # apply_async keeps a worker's exception inside the result object;
    # fetch every result so failures show up in the log.
    for path, result in pending:
        try:
            result.get()
        except Exception as e:
            print_info('下载失败: %s, %s' % (path, e))

    print_info('全部下载完成')


def parse_args():
    """
    Parse the command-line options of the download script.

    Options: ``-i`` file with the image URLs (required), ``-o`` output folder
    (required), ``-p`` number of worker processes (optional, default 20).

    :return: (img_file, out_folder, n_prc); n_prc is an int
    """
    parser = argparse.ArgumentParser(description='下载数据脚本')
    parser.add_argument('-i', dest='img_file', required=True, help='文件路径', type=str)
    parser.add_argument('-o', dest='out_folder', required=True, help='输出文件夹', type=str)
    # Parsed as int: multiprocessing.Pool cannot take the count as a string.
    parser.add_argument('-p', dest='n_prc', required=False, default=20, help='进程数', type=int)

    args = parser.parse_args()

    arg_img = args.img_file
    print_info("文件路径:%s" % arg_img)

    arg_out = args.out_folder
    print_info("输出文件夹:%s" % arg_out)

    arg_npc = args.n_prc
    print_info("进程数:%s" % arg_npc)
    return arg_img, arg_out, arg_npc


def write_line(file_name, line):
    """
    Append one line of text to a file.

    A tuple or list is written as its items joined by ", "; any other value
    is written via ``%s`` formatting. Nothing is written when ``file_name``
    is the empty string.

    :param file_name: path of the file to append to
    :param line: the line content, or a tuple/list of fields
    :return: None
    """
    if file_name == "":
        return
    # isinstance covers both sequence types; the former
    # `type(line) is (tuple or list)` only ever matched tuples.
    if isinstance(line, (tuple, list)):
        line = ", ".join(str(item) for item in line)
    with open(file_name, "a+", encoding='utf8') as fs:
        fs.write("%s\n" % line)


def print_info(log_str):
    """
    Log an info message: append it to the log file and echo it to stdout.

    :param log_str: message to log (converted with str())
    :return: None
    """
    stamp = get_current_time_str()
    message = u'[Info {}] {}'.format(stamp, str(log_str))
    write_line(logfile, message)
    print(message)


def mkdir_if_not_exist(dir_name, is_delete=False):
    """
    Make sure a folder exists, optionally wiping an existing one first.

    :param dir_name: folder path
    :param is_delete: when True, an existing folder is removed and recreated
    :return: True on success, False if any step raised an exception
    """
    try:
        if is_delete and os.path.exists(dir_name):
            shutil.rmtree(dir_name)
            print('[Info] 文件夹 "%s" 存在, 删除文件夹.' % dir_name)

        if os.path.exists(dir_name):
            return True

        os.makedirs(dir_name)
        print('[Info] 文件夹 "%s" 不存在, 创建文件夹.' % dir_name)
    except Exception as e:
        print('[Exception] %s' % e)
        return False
    return True


def traverse_dir_files(root_dir, ext=None):
    """
    List the files below a folder, recursing into sub-folders.

    Hidden files (names starting with '.') are skipped. The two returned
    lists are aligned and sorted by path.

    :param root_dir: root folder
    :param ext: optional suffix filter: a single suffix string such as
        '.jpg', or a list/tuple of suffixes
    :return: (list of file paths, list of file names)
    """
    if not ext:
        suffixes = None
    elif isinstance(ext, str):
        # tuple('.jpg') would split the string into single characters and
        # match nearly every file, so wrap a lone suffix instead.
        suffixes = (ext,)
    else:
        suffixes = tuple(ext)

    found = []
    for parent, _, file_names in os.walk(root_dir):
        for name in file_names:
            if name.startswith('.'):  # skip hidden files
                continue
            if suffixes and not name.endswith(suffixes):
                continue
            found.append((os.path.join(parent, name), name))

    # Sorting the (path, name) pairs keeps both lists aligned and also
    # handles the empty-folder case without special treatment.
    found.sort()
    paths_list = [path for path, _ in found]
    names_list = [name for _, name in found]
    return paths_list, names_list


def sort_two_list(list1, list2):
    """
    Sort two parallel lists together, ordered by list1 (ties by list2).

    :param list1: first list, provides the primary sort key
    :param list2: second list, reordered alongside list1
    :return: the two sorted lists; two empty lists for empty input
    """
    pairs = sorted(zip(list1, list2))
    if not pairs:
        # zip(*[]) yields nothing, so unpacking it into two names would fail.
        return [], []
    list1, list2 = (list(t) for t in zip(*pairs))
    return list1, list2


def read_file(data_file, mode='more'):
    """
    Read a text file either as one string or as a list of stripped lines.

    :param data_file: path of the file to read
    :param mode: 'one' returns the whole content as a single string;
        'more' returns a list with one stripped entry per line
    :return: string or list as described; an empty list for any other mode
        or when the file cannot be opened/read (IOError)
    """
    try:
        with open(data_file, 'r') as f:
            if mode == 'one':
                return f.read()
            if mode == 'more':
                return [raw_line.strip() for raw_line in f]
            return []
    except IOError:
        return []


def main():
    """
    Script entry point: read the CLI options, make sure the output folder
    exists, then start the pooled download.
    """
    img_file, out_folder, n_prc = parse_args()
    mkdir_if_not_exist(out_folder)  # create the output folder if needed
    download_imgs_for_mp(img_file, out_folder, n_prc)


# Run the downloader only when this file is executed as a script.
if __name__ == '__main__':
    main()

图像压缩脚本:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Copyright (c) 2018. All rights reserved.
Created by C. L. Wang on 2018/8/8
"""

import os
import shutil
import argparse

from multiprocessing import Pool
from PIL import Image


def sort_two_list(list1, list2):
    """
    Sort two parallel lists together, ordered by list1 (ties by list2).

    :param list1: first list, provides the primary sort key
    :param list2: second list, reordered alongside list1
    :return: the two sorted lists; two empty lists for empty input
    """
    pairs = sorted(zip(list1, list2))
    if not pairs:
        # zip(*[]) yields nothing, so unpacking it into two names would fail.
        return [], []
    list1, list2 = (list(t) for t in zip(*pairs))
    return list1, list2


def mkdir_if_not_exist(dir_name, is_delete=False):
    """
    Make sure a folder exists, optionally wiping an existing one first.

    :param dir_name: folder path
    :param is_delete: when True, an existing folder is removed and recreated
    :return: True on success, False if any step raised an exception
    """
    try:
        if is_delete and os.path.exists(dir_name):
            shutil.rmtree(dir_name)
            print('[Info] 文件夹 "%s" 存在, 删除文件夹.' % dir_name)

        if os.path.exists(dir_name):
            return True

        os.makedirs(dir_name)
        print('[Info] 文件夹 "%s" 不存在, 创建文件夹.' % dir_name)
    except Exception as e:
        print('[Exception] %s' % e)
        return False
    return True


def traverse_dir_files(root_dir, ext=None):
    """
    List the files below a folder, recursing into sub-folders.

    Hidden files (names starting with '.') are skipped. The two returned
    lists are aligned and sorted by path.

    :param root_dir: root folder
    :param ext: optional suffix filter: a single suffix string such as
        '.jpg', or a list/tuple of suffixes
    :return: (list of file paths, list of file names)
    """
    if not ext:
        suffixes = None
    elif isinstance(ext, str):
        # tuple('.jpg') would split the string into single characters and
        # match nearly every file, so wrap a lone suffix instead.
        suffixes = (ext,)
    else:
        suffixes = tuple(ext)

    found = []
    for parent, _, file_names in os.walk(root_dir):
        for name in file_names:
            if name.startswith('.'):  # skip hidden files
                continue
            if suffixes and not name.endswith(suffixes):
                continue
            found.append((os.path.join(parent, name), name))

    # Sorting the (path, name) pairs keeps both lists aligned and also
    # handles the empty-folder case without special treatment.
    found.sort()
    paths_list = [path for path, _ in found]
    names_list = [name for _, name in found]
    return paths_list, names_list


def compress_img(in_path, out_path, size=1024):
    """
    Shrink one image with PIL's ``thumbnail`` and save the result.

    :param in_path: path of the source image
    :param out_path: path the resized image is saved to
    :param size: edge length of the (size, size) bounding box passed to
        ``Image.thumbnail``
    :return: None
    """
    # The context manager closes the underlying file handle even when
    # thumbnail() or save() raises.
    with Image.open(in_path) as img:
        img.thumbnail((size, size))
        img.save(out_path)
    print('Processed: {}'.format(out_path))


def process_folder(in_folder, out_folder, size=1024, n_prc=20):
    """
    Compress every file found below ``in_folder`` using a process pool.

    Each result is saved directly in ``out_folder`` under the source file's
    base name. Failures of individual files are reported on stdout and do
    not stop the remaining work.

    :param in_folder: folder that is searched recursively for input files
    :param out_folder: folder the resized images are written to
    :param size: bounding-box edge length forwarded to compress_img
    :param n_prc: number of worker processes
    :return: None
    """
    mkdir_if_not_exist(out_folder)  # create the output folder
    path_list, name_list = traverse_dir_files(in_folder)
    pool = Pool(processes=n_prc)  # process pool for the compression jobs

    pending = []
    for in_path, name in zip(path_list, name_list):
        # NOTE(review): output names are flat, so equally named files from
        # different sub-folders would share one output path - confirm.
        out_path = os.path.join(out_folder, name)
        result = pool.apply_async(compress_img, (in_path, out_path, size))
        pending.append((in_path, result))

    pool.close()
    pool.join()

    # apply_async keeps a worker's exception inside the result object;
    # fetch every result so failures are not silently lost.
    for in_path, result in pending:
        try:
            result.get()
        except Exception as e:
            print('[Exception] {}: {}'.format(in_path, e))

    print('全部处理完成')


def parse_args():
    """
    Parse the command-line options of the compression script.

    Options: ``-i`` input folder (required), ``-o`` output folder (required),
    ``-s`` target size (optional, default 1024), ``-p`` number of worker
    processes (optional, default 20).

    :return: (in_folder, out_folder, size, n_prc); size and n_prc are ints
    """
    parser = argparse.ArgumentParser(description='压缩图片脚本')
    parser.add_argument('-i', dest='in_folder', required=True, help='输入文件夹', type=str)
    parser.add_argument('-o', dest='out_folder', required=True, help='输出文件夹', type=str)
    # type=int lets argparse reject non-numeric values with a usage message
    # instead of a ValueError traceback from a later int() call.
    parser.add_argument('-s', dest='size', required=False, default=1024, help='最长边', type=int)
    parser.add_argument('-p', dest='n_prc', required=False, default=20, help='进程数', type=int)
    args = parser.parse_args()

    in_folder = args.in_folder
    print("文件路径:%s" % in_folder)

    out_folder = args.out_folder
    print("输出文件夹:%s" % out_folder)
    size = args.size
    n_prc = args.n_prc

    print('图片尺寸: {}, 进程数: {}'.format(size, n_prc))

    return in_folder, out_folder, size, n_prc


def main():
    """
    Script entry point: read the CLI options, make sure the output folder
    exists, then compress the input folder.
    """
    in_folder, out_folder, size, n_prc = parse_args()
    mkdir_if_not_exist(out_folder)  # create the output folder if needed
    process_folder(in_folder, out_folder, size, n_prc)


# Run the compressor only when this file is executed as a script.
if __name__ == '__main__':
    main()
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章