• 前端
  • JS
  • CSS
  • HTML
  • Mysql
  • Linux
  • SVN
  • 环境uedbet官网手机版最新
  • uedbet西甲体育投注详解
  • MAC_BOOK
  • 算法
  • 抓取豆瓣电影代码
    By skyshappiness Posted 2017-01-04 21:46:09 In

    #!/usr/bin/python

    #encoding:utf-8

    import MySQLdb

    import urllib2

    from bs4 import BeautifulSoup

    import sys

    reload(sys)

    sys.setdefaultencoding("utf-8")

    #查找一条数据

    def findData(sql, params=None):
        """Run a query against the local ``blog`` database and return one row.

        A fresh connection is opened for every call and closed before returning.

        Args:
            sql: SQL statement to execute.
            params: Optional values bound by the driver to ``%s`` placeholders
                in ``sql``. When omitted the statement is executed as given,
                which is how existing callers use this function.

        Returns:
            The first row of the result set as returned by
            ``cursor.fetchone()``, or ``None`` when the query yields no rows.
        """
        db = MySQLdb.connect(charset="utf8", host="localhost", user="root", passwd="", db="blog")
        try:
            cursor = db.cursor()
            cursor.execute(sql, params)
            return cursor.fetchone()
        finally:
            # Release the connection even when the query raises; otherwise a
            # failing statement would leak one connection per call.
            db.close()

    #插入/更新 一条数据

    def insertData(sql, params=None):
        """Execute a single INSERT/UPDATE statement on the ``blog`` database.

        A fresh connection is opened for every call; the statement is committed
        on success and the connection is always closed.

        Args:
            sql: SQL statement to execute.
            params: Optional values bound by the driver to ``%s`` placeholders
                in ``sql``. When omitted the statement is executed as given,
                which is how existing callers use this function.
        """
        db = MySQLdb.connect(charset="utf8", host="localhost", user="root", passwd="", db="blog")
        try:
            cursor = db.cursor()
            cursor.execute(sql, params)
            db.commit()
        finally:
            # Close the connection even when execute/commit raises so a bad
            # statement does not leak a connection. Uncommitted work is
            # discarded by the server when the connection goes away.
            db.close()

    #抓取页面

    def grabContent(url):
        """Download ``url`` with a desktop-browser User-Agent and return the body.

        Args:
            url: Address of the page to fetch.

        Returns:
            The raw response body as returned by ``read()``.

        Raises:
            Whatever ``urllib2.urlopen`` raises for network or HTTP errors;
            nothing is caught here.
        """
        header = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36',
        }
        req = urllib2.Request(url, headers=header)
        con = urllib2.urlopen(req)
        try:
            return con.read()
        finally:
            # Close the response even if read() fails mid-transfer.
            con.close()

    #一切从零开始

    def startFromZero():
        """Seed the tag table from Douban's tag index, then start crawling.

        Fetches ``http://movie.douban.com/tag/``, inserts the text of every
        ``<a class="tag">`` link into ``blog_movie_tag`` and finally calls
        ``grabMovieInfo()``.
        """
        data = grabContent('http://movie.douban.com/tag/')
        soup = BeautifulSoup(data, 'html.parser')
        for link in soup.find_all('a', {'class': 'tag'}):
            # The tag text comes from a remote page, so escape it before it is
            # spliced into the statement; a quote or backslash in a tag would
            # otherwise break (or inject into) the INSERT.
            tagText = MySQLdb.escape_string(str(link.get_text()))
            sqlStr = "INSERT INTO `blog_movie_tag` (tag) VALUES ('" + tagText + "')"
            insertData(sqlStr)
        grabMovieInfo()

    #开始抓取电影

    def grabMovieInfo():
        """Crawl the listing pages of one pending tag and store new movies.

        Picks one row of ``blog_movie_tag`` whose status is ``'0'`` and walks
        35 listing pages for that tag (offsets 0, 15, ..., 495 and finally
        500). Just before the last page is fetched the tag is marked with
        status ``'1'``. Every ``<dl>`` entry not yet present in ``blog_movie``
        is inserted and its detail page is handed to ``grabMovieTag``.

        Returns immediately when no pending tag is left.
        """
        tag = findData("SELECT `tag` from blog_movie_tag WHERE status = '0' LIMIT 1 ")
        if tag is None:
            # No unprocessed tag: nothing to crawl (previously a TypeError).
            return
        tagName = tag[0]

        for i in range(0, 35):
            if i != 34:
                startNum = i * 15
            else:
                # The tag value must be a quoted, escaped string literal; the
                # unquoted form produced an invalid UPDATE statement.
                updateTagSql = ("UPDATE `blog_movie_tag` SET status = '1' where tag = '"
                                + MySQLdb.escape_string(str(tagName)) + "'")
                insertData(updateTagSql)
                startNum = 500

            pageUrl = "http://www.douban.com/tag/" + tagName + "/movie?start=" + str(startNum)
            data = grabContent(pageUrl)
            soup = BeautifulSoup(data, 'html.parser')

            for entry in soup.find_all('dl'):
                titleLink = entry.find('a', {'class': 'title'})
                if titleLink is None:
                    # Not a movie entry (no title link): skip instead of crashing.
                    continue
                # Apostrophes are stripped from the stored name, as before, so
                # names stay comparable with rows written by earlier runs.
                movieName = str(titleLink.get_text()).replace("'", "")
                descNode = entry.find('div', {'class': 'desc'})
                movieYear = str(descNode.get_text()) if descNode is not None else ""

                # Scraped text is escaped before being embedded in SQL.
                safeName = MySQLdb.escape_string(movieName)
                safeYear = MySQLdb.escape_string(movieYear)

                oldDataSql = "SELECT `id` FROM `blog_movie` WHERE movie_name = '" + safeName + "'"
                existId = findData(oldDataSql)
                if existId is None:
                    movieSql = ("INSERT INTO `blog_movie` (movie_name, movie_year) VALUES ('"
                                + safeName + "','" + safeYear + "')")
                    insertData(movieSql)
                    # Pass the unescaped name, exactly as the original did.
                    grabMovieTag(titleLink.get('href'), movieName)

    #抓取电影详情页的标签

    def grabMovieTag(url, movieName):
        """Fetch a movie detail page, store its rating and collect its tags.

        Args:
            url: Address of the movie detail page.
            movieName: Name under which the movie is stored in ``blog_movie``;
                used to locate the row whose ``rate`` column is updated.

        The text of ``<strong class="rating_num">`` is written to the movie's
        ``rate`` column. Up to eight links inside ``<div class="tags-body">``
        are then added to ``blog_movie_tag`` unless a row with the same tag
        already exists. Missing rating or tag markup is skipped silently.
        """
        data = grabContent(url)
        soup = BeautifulSoup(data, 'html.parser')

        # All scraped/forwarded text is escaped before being embedded in SQL.
        safeName = MySQLdb.escape_string(str(movieName))

        rateNode = soup.find('strong', {'class': 'rating_num'})
        if rateNode is not None:
            movieRate = MySQLdb.escape_string(str(rateNode.get_text()))
            updateMovieSql = ("UPDATE `blog_movie` SET rate = '" + movieRate
                              + "' WHERE movie_name = '" + safeName + "'")
            insertData(updateMovieSql)

        tagsBody = soup.find('div', {'class': 'tags-body'})
        if tagsBody is None:
            return

        # Only look at links inside the tags container. find_all_next() kept
        # walking the rest of the document, so pages with fewer than eight
        # tags had unrelated links stored as tags.
        for grabTag in tagsBody.find_all('a', limit=8):
            tagText = MySQLdb.escape_string(str(grabTag.get_text()))
            oldDataSql = "SELECT `id` FROM `blog_movie_tag` WHERE tag = '" + tagText + "'"
            existId = findData(oldDataSql)
            if existId is None:
                dataSql = "INSERT INTO `blog_movie_tag` (tag) VALUES ('" + tagText + "')"
                insertData(dataSql)

    # Start the crawl only when the file is executed as a script, so importing
    # it (e.g. to reuse the helpers) does not hit the network or the database.
    if __name__ == "__main__":
        startFromZero()

    友情链接
    联系方式
  • 邮箱 / E-mail:121388038@qq.com