清理OSS中未被Markdown文章引用的文件
清理OSS中未被Markdown文章引用的文件
我把阿里云OSS上的文件全部迁移到了七牛云:在阿里云上查看图片会产生下行流量费用,而我对安全性要求不高,所以选择了性价比更高的七牛云——它提供10G的免费额度,对我来说够用了。
可以点击以下链接试用:七牛云使用链接
在迁移的过程中,我发现存储中有许多垃圾文件:它们是我编写文章时删掉了引用、却没有同步从存储中删除的文件。于是我编写了一个Python脚本,把存储中的文件与Markdown文章中引用的文件比较一下,再调用接口删除那些没有被任何文章引用的文件。脚本如下:
import os
import re
import urllib.parse
from qiniu import Auth, BucketManager
import requests
def connect_qiniu(access_key, secret_key, bucket_name):
    """Build the Qiniu client objects used by the rest of the script.

    Returns a ``(bucket_manager, bucket_name, auth)`` tuple on success;
    ``bucket_name`` is passed through unchanged. If constructing either
    object raises, the error is printed and ``(None, None, None)`` is
    returned instead.
    """
    try:
        credentials = Auth(access_key, secret_key)
        manager = BucketManager(credentials)
    except Exception as e:
        print(f"连接七牛云失败: {e}")
        return None, None, None
    return manager, bucket_name, credentials
def get_qiniu_file_list(bucket, bucket_name, prefix='doc'):
    """List every object under ``prefix`` and URL-decode its relative name.

    Pages through ``bucket.list`` using the returned marker until the
    listing reports end-of-file.

    Args:
        bucket: object exposing ``list(bucket_name, prefix=..., marker=...)``
            that returns ``(ret, eof, info)``.
        bucket_name: name of the bucket to list.
        prefix: key prefix to list; it is stripped from the front of each
            key to obtain the relative file name.

    Returns:
        ``(names, key_to_decoded)`` where ``names`` is the list of decoded
        relative file names and ``key_to_decoded`` maps each original key
        to its decoded relative name. If a page comes back with
        ``ret is None`` the error is printed and whatever was collected so
        far is returned. If an exception is raised, ``([], {})`` is
        returned.
    """
    os_file_list = []
    key_to_decoded = {}  # original key -> decoded relative file name
    try:
        marker = None
        while True:
            ret, eof, info = bucket.list(bucket_name, prefix=prefix, marker=marker)
            if ret is None:
                print(f"获取文件列表失败: {info}")
                break
            for item in ret.get('items', []):
                original_key = item['key']
                # Strip the prefix only from the front of the key.
                # str.replace() would also eat later occurrences, turning
                # "doc/mydoc/x.png" into "my/x.png".
                if prefix and original_key.startswith(prefix):
                    relative = original_key[len(prefix):]
                else:
                    relative = original_key
                file_name = relative.lstrip('/')
                if file_name:  # skip the bare "folder" key itself
                    decoded_name = urllib.parse.unquote(file_name)
                    os_file_list.append(decoded_name)
                    key_to_decoded[original_key] = decoded_name
            if eof:
                break
            marker = ret.get('marker')
        return os_file_list, key_to_decoded
    except Exception as e:
        print(f"获取七牛云文件列表时出错: {e}")
        return [], {}
def find_md_files(local_path):
    """Recursively collect the paths of all Markdown files under ``local_path``.

    The ``.md`` suffix is matched case-insensitively so files such as
    ``README.MD`` are not missed; a missed file would make the images it
    references look unused.

    Args:
        local_path: root directory to walk.

    Returns:
        A list of file paths (each directory joined with the file name).
        The list is empty when nothing matches, which includes a
        ``local_path`` that does not exist because ``os.walk`` yields
        nothing for it. If walking raises, the error is printed and ``[]``
        is returned.
    """
    md_files = []
    try:
        for root, _dirs, files in os.walk(local_path):
            md_files.extend(
                os.path.join(root, name)
                for name in files
                if name.lower().endswith('.md')
            )
        return md_files
    except (OSError, TypeError, ValueError) as e:
        print(f"查找Markdown文件时出错: {e}")
        return []
def extract_referenced_images(md_files):
    """Collect the image paths referenced under ``/doc/`` in Markdown files.

    Each file is searched for ``/doc/<path>.<ext>`` where ``<ext>`` is one of
    png, jpg, jpeg, gif, bmp or webp (case-insensitive). The captured path
    (the part after ``/doc/``) is URL-decoded.

    Errors are handled per file. A file that cannot be opened is reported
    and skipped, and undecodable bytes are replaced rather than aborting, so
    one bad file no longer discards the references found in all the others.
    Losing every reference would make every stored file look unused.

    Args:
        md_files: iterable of Markdown file paths.

    Returns:
        A de-duplicated list of decoded image paths, in no particular order.
    """
    # Supports several image formats; compiled once for all files.
    pattern = re.compile(r'/doc/(.*?\.(?:png|jpg|jpeg|gif|bmp|webp))', re.IGNORECASE)
    referenced = set()
    for md_file in md_files or ():
        try:
            with open(md_file, 'r', encoding='utf-8', errors='replace') as f:
                content = f.read()
        except (OSError, TypeError, ValueError) as e:
            print(f"提取图片引用时出错: {md_file} - {e}")
            continue
        referenced.update(urllib.parse.unquote(match) for match in pattern.findall(content))
    return list(referenced)
def find_useless_files(osFileList, pageFileList):
    """Return the stored file names that no Markdown file references.

    Both inputs are turned into sets and the result is the names present in
    ``osFileList`` but absent from ``pageFileList``, as a list in no
    particular order. If building the sets raises, the error is printed and
    ``[]`` is returned.
    """
    try:
        stored = set(osFileList)
        referenced = set(pageFileList)
        # Names that exist in the bucket but are never referenced.
        return list(stored.difference(referenced))
    except Exception as e:
        print(f"对比文件列表时出错: {e}")
        return []
def decode_all_filenames(file_list):
    """Return a new list with every name in ``file_list`` URL-decoded."""
    decoded = []
    for name in file_list:
        decoded.append(urllib.parse.unquote(name))
    return decoded
def delete_useless_files(bucket, bucket_name, auth, uselessFileList, key_to_decoded, prefix='/doc/'):
    """Interactively delete the given unreferenced files from the bucket.

    Prints the candidate names, asks for confirmation on stdin, and only
    proceeds when the answer is ``y`` (case-insensitive). Each confirmed
    name is mapped back to its original bucket key(s) via
    ``key_to_decoded`` and deleted with ``bucket.delete``. A deletion
    counts as successful when ``info.status_code == 200``.

    Compared with a first-match scan, every key that decodes to a confirmed
    name is deleted, not just the first one. Names with no matching key are
    reported instead of being silently dropped.

    Args:
        bucket: object exposing ``delete(bucket_name, key)`` that returns
            ``(ret, info)``.
        bucket_name: bucket to delete from.
        auth: unused; kept for interface compatibility.
        uselessFileList: decoded file names to delete.
        key_to_decoded: mapping of original bucket key -> decoded name.
        prefix: unused; kept for interface compatibility.

    Returns:
        None. Progress and results are printed.
    """
    if not uselessFileList:
        print("没有需要删除的文件")
        return
    print(f"\n准备删除 {len(uselessFileList)} 个无用文件")
    print("请确认要删除的文件列表:")
    for i, filename in enumerate(sorted(uselessFileList), 1):
        print(f"{i:3d}. {filename}")
    # Ask for confirmation before deleting anything.
    confirm = input("\n确定要删除这些文件吗?(y/N): ")
    if confirm.lower() != 'y':
        print("取消删除操作")
        return
    # Reverse index (decoded name -> original keys), built once so each
    # lookup is O(1) and duplicate keys for the same name are not lost.
    keys_by_name = {}
    for original_key, decoded_key in key_to_decoded.items():
        keys_by_name.setdefault(decoded_key, []).append(original_key)
    keys_to_delete = []
    unmatched_names = []
    # dict.fromkeys de-duplicates while preserving order, so the same key
    # is never submitted for deletion twice.
    for decoded_name in dict.fromkeys(uselessFileList):
        matching_keys = keys_by_name.get(decoded_name)
        if matching_keys:
            keys_to_delete.extend(matching_keys)
        else:
            unmatched_names.append(decoded_name)
    for name in unmatched_names:
        print(f"未找到对应的七牛云键名,已跳过: {name}")
    # Perform the deletions.
    deleted_count = 0
    failed_deletions = []
    for key in keys_to_delete:
        try:
            ret, info = bucket.delete(bucket_name, key)
            if info.status_code == 200:
                print(f"成功删除: {key}")
                deleted_count += 1
            else:
                print(f"删除失败: {key} - 状态码: {info.status_code}")
                failed_deletions.append(key)
        except Exception as e:
            print(f"删除异常: {key} - 错误: {e}")
            failed_deletions.append(key)
    # Summarize the outcome.
    print("\n删除操作完成:")
    print(f"成功删除: {deleted_count} 个文件")
    if failed_deletions:
        print(f"删除失败: {len(failed_deletions)} 个文件")
        for key in failed_deletions:
            print(f" - {key}")
def main():
    """Run the interactive clean-up of unreferenced files in the bucket.

    Steps:
      1. Connect to Qiniu and list the files under the ``doc/`` prefix.
      2. Scan the local Markdown tree and extract referenced image names.
      3. Diff the two lists to find files that nothing references.
      4. Show the result and, after confirmation, delete those files.

    The run aborts before any comparison when no Markdown file is found.
    With nothing to compare against, every stored file would be reported as
    unused.
    """
    # Qiniu configuration. The environment variables QINIU_ACCESS_KEY and
    # QINIU_SECRET_KEY take precedence; the literals are the defaults.
    # NOTE(review): avoid committing real credentials to source control.
    QINIU_ACCESS_KEY = os.environ.get('QINIU_ACCESS_KEY', '62knrQrpddGoaHNkPuYZT35tzsQCTd12vFxm9kmT')
    QINIU_SECRET_KEY = os.environ.get('QINIU_SECRET_KEY', '')
    QINIU_BUCKET_NAME = 'poesy'
    # Local path configuration: replace with your own Markdown directory.
    # Raw string so the backslashes are not parsed as escape sequences.
    LOCAL_MD_PATH = r'D:\life-doc\docs'
    # Step 1: connect to Qiniu and fetch the file list.
    print("步骤1: 连接七牛云...")
    bucket, bucket_name, auth = connect_qiniu(QINIU_ACCESS_KEY, QINIU_SECRET_KEY, QINIU_BUCKET_NAME)
    if not bucket:
        return
    print("获取七牛云文件列表...")
    # 'doc/' (with the slash) keeps sibling keys such as 'docs/...' or
    # 'document.png' out of the listing; they are not part of the folder.
    osFileList, key_to_decoded = get_qiniu_file_list(bucket, bucket_name, 'doc/')
    print(f"七牛云 /doc/ 文件夹下共有 {len(osFileList)} 个文件")
    # Step 2: walk the local Markdown files and extract referenced images.
    print("\n步骤2: 扫描本地Markdown文件...")
    md_files = find_md_files(LOCAL_MD_PATH)
    print(f"找到 {len(md_files)} 个Markdown文件")
    if not md_files:
        # A wrong path would otherwise mark the whole bucket for deletion.
        print(f"未在 {LOCAL_MD_PATH} 下找到任何Markdown文件,为避免误删已终止")
        return
    pageFileList = extract_referenced_images(md_files)
    print(f"从Markdown文件中提取到 {len(pageFileList)} 个引用的图片文件")
    # Make sure every file name has been URL-decoded.
    osFileList_decoded = decode_all_filenames(osFileList)
    pageFileList_decoded = decode_all_filenames(pageFileList)
    # Decode the key map the same way, so the names produced by the diff
    # can still be resolved back to their original bucket keys on deletion.
    key_to_decoded = {
        key: urllib.parse.unquote(name) for key, name in key_to_decoded.items()
    }
    # Print a few samples for debugging.
    print("\n=== 文件名示例 ===")
    print("七牛云文件示例:")
    for i, filename in enumerate(osFileList_decoded[:5]):
        print(f" {i + 1}. {filename}")
    print("\nMarkdown引用文件示例:")
    for i, filename in enumerate(pageFileList_decoded[:5]):
        print(f" {i + 1}. {filename}")
    # Step 3: diff the lists to find unreferenced files.
    print("\n步骤3: 对比文件列表...")
    uselessFileList = find_useless_files(osFileList_decoded, pageFileList_decoded)
    # Print the summary.
    print("\n=== 结果统计 ===")
    print(f"七牛云文件数量: {len(osFileList_decoded)}")
    print(f"引用的文件数量: {len(pageFileList_decoded)}")
    print(f"未引用的文件数量: {len(uselessFileList)}")
    if uselessFileList:
        print("\n=== 未引用的文件列表 ===")
        for i, filename in enumerate(sorted(uselessFileList), 1):
            print(f"{i:3d}. {filename}")
        # Extra information: show names whose URL-encoded form also exists.
        print("\n=== 注意:以下文件可能有编码变体 ===")
        stored_names = set(osFileList)  # built once for O(1) membership tests
        variant_count = 0
        for useless_file in uselessFileList:
            encoded_variant = urllib.parse.quote(useless_file)
            if encoded_variant != useless_file and encoded_variant in stored_names:
                print(f"原始: {useless_file}")
                print(f"编码: {encoded_variant}")
                print("---")
                variant_count += 1
        if variant_count == 0:
            print("未发现编码变体")
        # Step 4: delete the unreferenced files (asks for confirmation).
        print("\n步骤4: 删除无用文件...")
        delete_useless_files(bucket, bucket_name, auth, uselessFileList, key_to_decoded, '/doc/')
    else:
        print("\n没有找到未引用的文件!")


if __name__ == "__main__":
    main()