需求分析
简书的文章需要放到自己的博客里
保存图片并且将文字内容写到XX.MD文件
人生苦短,我用Python
Python实现
URL=简书文章的链接(例如 https://www.jianshu.com/p/cf65f7eb14e2),直接传入即可使用
需要用到的包有
- re(自带)
- os(自带)
- time(自带)
- requests(初次安装 pip install requests)
- bs4(初次安装 pip install beautifulsoup4)
- lxml(BeautifulSoup 使用的解析器,初次安装 pip install lxml)
用requests.get()爬取到HTML源码然后存在本地
然后当文本文件逐行读取
符合规则就写入到MD文件中
源码
'''
简书转为Markdown文件方便转为博客文件
并下载图片文件
2021-9-6
'''
import requests,re,os,time
from bs4 import BeautifulSoup
def _first_match(pattern, text):
    """Return the first capture group of *pattern* in *text*, or None."""
    found = re.search(pattern, text)
    return found.group(1) if found else None


def jianshutomd(url):
    """Convert a Jianshu article into a Markdown file and download its images.

    The page at *url* is fetched and saved as ``<title>.html``; the saved
    file is then scanned line by line. Lines starting with ``<p>`` or
    ``</div><p>`` contribute their first paragraph text, and lines starting
    with ``<div class="image-view" `` contribute an image link while the
    image itself is downloaded into a folder named after the title. The
    result, prefixed with a front-matter header, is written to
    ``<title>.md``.

    Args:
        url: Address of the article page, e.g.
            ``https://www.jianshu.com/p/cf65f7eb14e2``.

    Returns:
        None. All output goes to the current working directory.
    """
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36',
    }
    # A timeout keeps a stalled connection from hanging the script forever.
    req = requests.get(url=url, headers=headers, timeout=30)
    html = req.text
    soup = BeautifulSoup(html, 'lxml')

    # Derive a folder/file name from the page title: spaces become dashes,
    # the site suffix is dropped, and slashes are replaced so the title
    # cannot introduce extra path components.
    htmltitle = soup.title.text
    htmltitle = htmltitle.replace(' ', '-')
    htmltitle = htmltitle.replace('---简书', '')
    htmltitle = htmltitle.replace('/', '-')
    print(htmltitle)

    # Folder that receives the downloaded images.
    os.makedirs(htmltitle, exist_ok=True)

    # Keep a copy of the raw page, then read it back as lines of text.
    html_path = htmltitle + '.html'
    with open(html_path, 'w', encoding='utf-8') as html_file:
        html_file.write(html)
    with open(html_path, encoding='utf-8') as html_file:
        lines = html_file.readlines()

    md_parts = [
        "---\ntitle: " + htmltitle + "\ncategories: 类别\ndate: "
        + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\n---\n"
    ]
    for line in lines:
        if line.startswith(('<p>', '</div><p>')):
            paragraph = _first_match('<p>(.*?)</p>', line)
            # A paragraph that is not closed on the same line is skipped
            # instead of aborting the whole conversion.
            if paragraph is not None:
                md_parts.append(paragraph + '\n')
        if line.startswith('<div class="image-view" '):
            pngname = _first_match('upload_images/(.*?)"', line)
            src = _first_match('-src="(.*?)"', line)
            if pngname is None or src is None:
                continue
            # NOTE(review): the original image-link expression was garbled
            # in the source; it is reconstructed here to point at the file
            # saved below (<title>/<image name>) - confirm the intended path.
            md_parts.append('![](' + htmltitle + '/' + pngname + ')\n')
            # The captured source address has no scheme, so one is prepended.
            geturl = 'http:' + src
            print(geturl)
            print(pngname)
            image = requests.get(geturl, headers=headers, timeout=30)
            with open(htmltitle + '/' + pngname, 'wb') as image_file:
                image_file.write(image.content)

    with open(htmltitle + '.md', 'w', encoding='utf-8') as md_file:
        md_file.write(''.join(md_parts))
if __name__ == '__main__':
    # Prompt for article URLs and convert each one, until interrupted.
    while True:
        jianshutomd(input("请输入文章URL地址:"))