The Algorithms logo
算法
关于我们捐赠

Netflix 爬虫

H
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Netflix Scraper\n",
    "\n",
    "The purpose of this code is to fetch the details of all the categories on Netflix, and then to gather the sub-categories of a selected category along with the movies listed under each sub-category."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import requests\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_soup(url):\n",
    "    return BeautifulSoup(requests.get(url).text, 'html.parser')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def browseCategory(category, data):\n",
    "    category_url = data[category-1][2]\n",
    "    category = data[category-1][1]\n",
    "    subCategory_details = []\n",
    "    count = 1\n",
    "    subCategories = []\n",
    "    soup = make_soup(category_url)\n",
    "    cards_list = soup.find_all('section',{'class':'nm-collections-row'})\n",
    "    for card in cards_list:\n",
    "        try:\n",
    "            subCategory = card.find('h1').text\n",
    "            movie_list = []\n",
    "            movies = card.find_all('li')\n",
    "            movie_count = 1\n",
    "            for movie in movies:\n",
    "                try:\n",
    "                    movie_title = movie.find('span',{'class':'nm-collections-title-name'}).text\n",
    "                    movie_link = movie.find('a').get('href')\n",
    "                    movie_list.append([movie_count, movie_title , movie_link])\n",
    "                    movie_count += 1\n",
    "                except AttributeError:\n",
    "                    pass\n",
    "            subCategories.append(subCategory)\n",
    "            subCategory_details.append(movie_list)\n",
    "            count += 1\n",
    "        except AttributeError:\n",
    "            pass\n",
    "    return subCategories, subCategory_details, count-1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getCategories(base_url):\n",
    "    category_soup = make_soup(base_url)\n",
    "    categories = category_soup.find_all('section',{'class':'nm-collections-row'})\n",
    "    result=[]\n",
    "    count = 1\n",
    "    for category in categories:\n",
    "        try:\n",
    "            Title = category.find('span', {'class':'nm-collections-row-name'}).text\n",
    "            url = category.find('a').get('href')\n",
    "            result.append([count, Title, url])\n",
    "            count += 1\n",
    "        except AttributeError:\n",
    "            pass\n",
    "    #print(result)\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def main():\n",
    "    netflix_url = \"https://www.netflix.com/in/browse/genre/839338\"\n",
    "    categories = getCategories(netflix_url)\n",
    "    print(\"Please select one of the category\")\n",
    "    df = pd.DataFrame(np.array(categories), columns=['Sr.No', 'Title', 'link'])\n",
    "    print(df.to_string(index=False))\n",
    "    choice = int(input('\\n\\n Please Enter your Choice: \\n'))\n",
    "    subCategories, movieList, count = browseCategory(choice, categories)\n",
    "    for i in range(0, count):\n",
    "        print(subCategories[i],'\\n\\n')\n",
    "        subCategory_df = pd.DataFrame(np.array(movieList[i]), columns=['Sr.No', 'Title', 'link'])\n",
    "        print(subCategory_df.to_string(index=False))\n",
    "        print(\"\\n\\n\\n\")\n",
    "    \n",
    "if __name__ == '__main__':\n",
    "    main()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
关于此算法

Netflix 爬虫

这段代码的目的是获取 Netflix 上所有类别的详细信息,然后收集所选类别下的各个子类别以及每个子类别中电影的信息。

from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
def make_soup(url, timeout=10):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout ``requests.get`` may block indefinitely on a
            stalled connection.

    Returns:
        A ``BeautifulSoup`` tree built from the response text with Python's
        built-in ``html.parser``.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')
def browseCategory(category, data):
    """Scrape the sub-categories of one category and the movies in each.

    Args:
        category: 1-based position of the chosen category within ``data``.
        data: Sequence of category rows; the URL to scrape is read from
            index 2 of the chosen row.

    Returns:
        A tuple ``(subCategories, subCategory_details, count)``:
        ``subCategories`` is the list of sub-category headings,
        ``subCategory_details`` holds one list of ``[number, title, link]``
        movie rows per heading (same order), and ``count`` is the number of
        sub-categories collected.

    Raises:
        IndexError: If ``category`` is not between 1 and ``len(data)``.
    """
    # Reject 0 and negative choices explicitly: data[category - 1] would
    # otherwise wrap around and silently select a row from the end of the list.
    if not 1 <= category <= len(data):
        raise IndexError(
            'category must be between 1 and {}, got {}'.format(len(data), category)
        )

    soup = make_soup(data[category - 1][2])
    subCategories = []
    subCategory_details = []
    for card in soup.find_all('section', {'class': 'nm-collections-row'}):
        heading = card.find('h1')
        if heading is None:
            # A row without a heading is not a sub-category; skip it.
            continue
        subCategory = heading.text

        movie_list = []
        for movie in card.find_all('li'):
            title_tag = movie.find('span', {'class': 'nm-collections-title-name'})
            if title_tag is None:
                continue
            link_tag = movie.find('a')
            if link_tag is None:
                continue
            # Number movies by their position among the entries actually kept.
            movie_list.append([len(movie_list) + 1, title_tag.text, link_tag.get('href')])

        subCategories.append(subCategory)
        subCategory_details.append(movie_list)
    return subCategories, subCategory_details, len(subCategories)
def getCategories(base_url):
    """Collect the category rows found on the page at ``base_url``.

    Returns a list of ``[number, title, url]`` entries numbered from 1 in
    page order. Sections that lack the title span or an anchor are skipped.
    """
    page = make_soup(base_url)
    rows = []
    for section in page.find_all('section', {'class': 'nm-collections-row'}):
        name_tag = section.find('span', {'class': 'nm-collections-row-name'})
        if name_tag is None:
            continue
        anchor = section.find('a')
        if anchor is None:
            continue
        # The running number is the position among the rows kept so far.
        rows.append([len(rows) + 1, name_tag.text, anchor.get('href')])
    return rows
def main():
    """Interactively list the categories and print one category's movies.

    Fetches the categories from a fixed Netflix genre URL, prints them as a
    table, asks the user for a 1-based choice, then prints one table of movies
    per sub-category of the chosen category. Returns ``None``; prints a
    message and returns early when no categories were found or the input is
    not a valid choice.
    """
    netflix_url = "https://www.netflix.com/in/browse/genre/839338"
    columns = ['Sr.No', 'Title', 'link']

    categories = getCategories(netflix_url)
    if not categories:
        print("No categories were found on the page.")
        return

    print("Please select one of the category")
    # Build the frame straight from the row lists: routing them through
    # np.array() yields a 1-D array for an empty list, which DataFrame
    # rejects when three column names are given.
    df = pd.DataFrame(categories, columns=columns)
    print(df.to_string(index=False))

    try:
        choice = int(input('\n\n Please Enter your Choice: \n'))
    except ValueError:
        print("Invalid choice: please enter a number.")
        return
    if not 1 <= choice <= len(categories):
        print("Invalid choice: please enter a number between 1 and {}.".format(len(categories)))
        return

    # The third return value is the number of sub-categories; iterating over
    # the two parallel lists directly makes it unnecessary here.
    subCategories, movieList, _ = browseCategory(choice, categories)
    for subCategory, movies in zip(subCategories, movieList):
        print(subCategory, '\n\n')
        # A sub-category may contain no parsable movies; an empty list still
        # produces a valid (empty) frame this way.
        subCategory_df = pd.DataFrame(movies, columns=columns)
        print(subCategory_df.to_string(index=False))
        print("\n\n\n")
    
# Start the interactive scraper only when this code runs as the main module
# (script or notebook kernel), not when it is imported.
if __name__ == '__main__':
    main()