dakkor
本帖最后由

本帖最后由 dakkor 于 2012-8-24 20:35 编辑

总之，花了一点时间写了个原型。代码很丑陋，没有做异常处理，估计 bug 也很多，就先将就着用吧。下载部分直接调用了 wget，懒得再用 urllib2 写文件下载的代码了，反正 wget 也很稳健。

[mw_shl_code=python,true]import os

import sys

from BeautifulSoup import BeautifulSoup

import urllib2

import re

def image_dir_for(html_path):
    """Return the directory that receives the images of *html_path*.

    The directory sits next to the HTML file and is named after it without
    the extension, e.g. ``pages/a.html`` -> ``pages/a``.
    """
    parent = os.path.dirname(html_path)
    # Join only the base name back onto the parent: joining the full path
    # (minus extension) would repeat the directory part for relative paths.
    stem = os.path.splitext(os.path.basename(html_path))[0]
    return os.path.join(parent, stem)


def image_suffix(src):
    """Return the file extension of image URL *src* (dot included), or ''.

    Query strings and fragments are ignored, and the scheme/host part is
    stripped first so that a bare ``http://host.com`` does not yield ``.com``.
    """
    path = src.split('#', 1)[0].split('?', 1)[0]
    path = re.sub(r'^[A-Za-z][A-Za-z0-9+.-]*://[^/]*', '', path)
    last_segment = path.rsplit('/', 1)[-1]
    return os.path.splitext(last_segment)[1]


def download_image(url, target):
    """Fetch *url* into the file *target* with wget; return True on success."""
    import subprocess  # local import: only this helper needs it

    # An argument list (no shell) keeps characters such as '&', ';' or '?'
    # in the URL from being interpreted as shell syntax.
    command = ['wget', url, '-nv', '-t', '10', '-c', '-Y', 'on', '-O', target]
    try:
        return subprocess.call(command) == 0
    except OSError as err:
        # Raised when the wget executable itself cannot be started.
        sys.stderr.write('cannot run wget: %s\n' % err)
        return False


def main(argv):
    """Download every <img> of the HTML file named in *argv* and relink it.

    *argv* is the command line (``sys.argv``); ``argv[1]`` is the HTML file.
    Images are saved as ``0.ext``, ``1.ext``, ... in a directory named after
    the HTML file, and the file is rewritten in place with each ``src``
    pointing at the downloaded copy.  Exits with a message when the argument
    is missing or is not an existing file.
    """
    if len(argv) < 2:
        sys.exit('usage: %s HTML_FILE' % argv[0])
    html_path = argv[1]
    if not os.path.isfile(html_path):
        sys.exit('not a file: %s' % html_path)

    image_dir = image_dir_for(html_path)
    print(image_dir)
    if not os.path.exists(image_dir):
        os.mkdir(image_dir)

    with open(html_path, 'r') as fp:
        soup = BeautifulSoup(fp.read())

    index = 0
    for img in soup.findAll('img'):
        src = img.get('src')
        if not src:
            # An <img> without a usable src has nothing to download.
            continue
        name = str(index) + image_suffix(src)
        # Rewrite the link only when the file really arrived; otherwise the
        # page keeps its original (still working) remote reference.
        if download_image(src, os.path.join(image_dir, name)):
            img['src'] = image_dir + '/' + name
        # Advance even on failure so a partial file is never reused for
        # a different image.
        index += 1

    with open(html_path, 'w') as fp:
        fp.write(soup.prettify())


if __name__ == '__main__':
    main(sys.argv)

[/mw_shl_code]