首页 > 脚本语言 > python > python爬取数据保存入库
2018
09-13

python爬取数据保存入库

python爬取数据保存入库
以下源代码基于 Python 2(urllib2 + MySQLdb),抓取 toutiao.io 最新文章的链接、标题和来源信息并保存到 MySQL 数据库,仅供参考。

import urllib2

import re

import MySQLdb

class LatestTest:
    """Scrape the newest posts from toutiao.io and store them in MySQL.

    Typical use: call ``getDate()`` to fetch and parse the listing page, then
    pass each returned ``[url, title, email]`` row to ``savaDateMysql()``.
    """

    # Compiled once for the class instead of on every getDate() call.
    # Captures, per post: the link href, the link text, and the text of the
    # first <span> inside the post's "meta" block.
    _POST_PATTERN = re.compile(
        r'<div class="post">.*?class="title">.*?href="(.*?)">(.*?)</a>'
        r'.*?<div class="meta">.*?<span>(.*?)</span>',
        re.S,
    )

    def __init__(self):
        """Set the page URL and the browser-like request headers."""
        self.url = "https://toutiao.io/latest"
        self.UserAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'
        self.header = {'User-Agent': self.UserAgent}

    def _parsePosts(self, html, limit=10):
        """Extract up to ``limit`` posts from the listing-page HTML.

        Args:
            html: page source to search.
            limit: maximum number of rows to return (a non-negative int);
                ``None`` returns every match.

        Returns:
            A list of ``[url, title, email]`` lists with surrounding
            whitespace stripped from each field. ``email`` is whatever text
            the first <span> of the post's meta block contains.
        """
        matches = self._POST_PATTERN.findall(html)
        if limit is not None:
            matches = matches[:limit]
        return [[field.strip() for field in match] for match in matches]

    def getDate(self, limit=10):
        """Download the listing page and return its first ``limit`` posts.

        Args:
            limit: maximum number of rows to return (default 10, the
                previous hard-coded cap); ``None`` returns every match.

        Returns:
            A list of ``[url, title, email]`` lists (see ``_parsePosts``).

        Raises:
            Whatever ``urllib2.urlopen`` raises on network/HTTP failure;
            nothing is caught here.
        """
        request = urllib2.Request(self.url, headers=self.header)
        response = urllib2.urlopen(request)
        try:
            html = response.read()
        finally:
            # Release the socket even if reading the body fails.
            response.close()
        return self._parsePosts(html, limit)

    def savaDateMysql(self, url, title, email):
        """Insert one scraped row into the ``content`` table.

        Errors are printed rather than raised, so a single bad row does not
        abort the caller's loop.

        Args:
            url: value for the ``url`` column.
            title: value for the ``title`` column.
            email: value for the ``email`` column.
        """
        # Placeholders are filled in by the driver, so quotes or other
        # special characters in scraped text cannot break (or inject into)
        # the statement.
        sql = "insert into content(url,title,email) values(%s,%s,%s)"
        # Stays None when connect() itself fails, so the cleanup below does
        # not raise a second error that would mask the first.
        conn = None
        try:
            # NOTE(review): credentials are hard-coded here; consider moving
            # them to configuration.
            conn = MySQLdb.connect('localhost', 'root', 'g6s8m3t7s', 'mysql', charset='utf8')
            cursor = conn.cursor()
            cursor.execute(sql, (url, title, email))
            conn.commit()
        except Exception as e:
            print(e)
        finally:
            if conn is not None:
                conn.close()


if __name__ == '__main__':
    # Scrape the latest posts, then echo each one and store it in MySQL.
    scraper = LatestTest()
    posts = scraper.getDate()
    try:
        for post in posts:
            # Each post is [url, title, email]; strip whitespace before use.
            url, title, email = [field.strip() for field in post[:3]]
            print("%s %s %s" % (url, title, email))
            scraper.savaDateMysql(url, title, email)
    except Exception as e:
        # Report the failure; the remaining posts are not processed.
        print(e)
最后编辑:
作者:admin

留下一个回复