出发点
由于最近一直忙于学业,码代码的手犯痒了,博客也好久没有更新,今天忙里偷闲来点干货。一直都觉得重要的东西放在别人家的服务器上有点不踏实,可用豆瓣管理个人读书记录又特别方便,于是,今天花了几个小时把豆瓣上的个人读书数据完整地离线备份到了本地。不多说,上干货。
干货
#Author: 冰蓝
#Douban personal domain: the last path component of the profile URL, e.g. for
#https://www.douban.com/people/lanbing510/ the domain is lanbing510.
#It may be passed as the first argument; without one the author's own
#domain is used, as before.
user=${1:-lanbing510}
#Mirror the user's book pages recursively, staying below the start directory
#(--no-parent) and rewriting links for offline browsing (--convert-links).
#The URL is quoted so the expansion of $user is never word-split or globbed.
wget --recursive --no-clobber --page-requisites --html-extension --convert-links --restrict-file-names=windows --no-parent "https://book.douban.com/people/$user/"
#Collect the asset URLs that the mirrored pages load from doubanio hosts and
#download them next to the mirrored pages.

#Print, one per line, every URL found in an attribute $1="..." of the HTML on
#stdin that starts with http, contains "doubanio" and ends in .$2.
#  $1 - attribute name (src or href)
#  $2 - file extension without the leading dot (jpg, js, png, css, ico)
_extract_asset_urls() {
  #grep -o prints each match on its own line, so several URLs sharing one
  #HTML line are all kept; [^"]* keeps a match inside one attribute value.
  #The sed step strips the leading attr=" and the trailing quote.
  grep -o "$1=\"http[^\"]*doubanio[^\"]*\.$2\"" | sed 's/^[^"]*"//; s/"$//'
}

#Download every URL listed in file $1 (one per line) with wget, recreating the
#URL's host and directory below ./book.douban.com/people/$user.
#  $1 - path of the URL list
_fetch_asset_list() {
  #Read line by line instead of word-splitting $(cat ...), so URLs are never
  #split or glob-expanded by the shell.
  while IFS= read -r line
  do
    [ -n "$line" ] || continue
    #Drop everything up to the first "/" (the "https:/" part), leaving
    #"/host/dir/file".
    dest=${line#*/}
    #${dest%/*} removes the file name; what remains is the target directory.
    #$user replaces the previously hard-coded account name.
    wget -c "$line" -P "./book.douban.com/people/$user${dest%/*}"
  done <"$1"
}

#Assets referenced through src="..."
for postfix in jpg js png
do
  find ./ -type f -name "*.html" -exec cat {} + | _extract_asset_urls src "$postfix" >"$postfix.txt"
done
#Assets referenced through href="..."
for postfix in css ico
do
  find ./ -type f -name "*.html" -exec cat {} + | _extract_asset_urls href "$postfix" >"$postfix.txt"
done
#De-duplicate each list
for postfix in jpg css js png ico
do
  sort -u "$postfix.txt" >"$postfix-out.txt"
done
#Fetch everything that was found
for f in jpg-out.txt css-out.txt js-out.txt png-out.txt ico-out.txt
do
  _fetch_asset_list "$f"
done
#Rewrite links in the mirrored pages so that they point at the local copies.

#Edit every *.html file below ./ in place:
#  - src="http(s)://<doubanio URL>.jpg|js|png"  becomes src="./<URL without scheme>"
#  - href="http(s)://<doubanio URL>.css|ico"    becomes href="./<URL without scheme>"
#  - href="https://book.douban.com/people/$user/<rest>" becomes href="<rest>"
#Reads the global $user. Each rule is confined to a single attribute value
#([^"]*) and applied globally, so every link on a line is rewritten.
_localize_links() {
  #Accumulate the sed program in the function's positional parameters so a
  #single sed pass over the files applies all rules in order.
  set --
  for postfix in jpg js png
  do
    set -- "$@" -e 's#src="https\{0,1\}://\([^"]*doubanio[^"]*\.'"$postfix"'\)"#src="./\1"#g'
  done
  for postfix in css ico
  do
    set -- "$@" -e 's#href="https\{0,1\}://\([^"]*doubanio[^"]*\.'"$postfix"'\)"#href="./\1"#g'
  done
  #Links back into the user's own pages become relative to the mirror root.
  set -- "$@" -e 's#href="https://book\.douban\.com/people/'"$user"'/#href="#g'
  find ./ -type f -name "*.html" -exec sed -i "$@" {} +
}
_localize_links
#Remove the intermediate URL lists (raw and de-duplicated) for each extension
for kind in jpg js png css ico
do
  rm "$kind.txt" "$kind-out.txt"
done
#Done
echo "success :)"
以上。