{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Netflix Scraper\n",
"\n",
"This notebook scrapes the list of categories from a Netflix browse page and then, for a chosen category, gathers its sub-categories and the movies listed under each sub-category."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"import requests\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def make_soup(url):\n",
" return BeautifulSoup(requests.get(url).text, 'html.parser')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def browseCategory(category, data):\n",
" category_url = data[category-1][2]\n",
" category = data[category-1][1]\n",
" subCategory_details = []\n",
" count = 1\n",
" subCategories = []\n",
" soup = make_soup(category_url)\n",
" cards_list = soup.find_all('section',{'class':'nm-collections-row'})\n",
" for card in cards_list:\n",
" try:\n",
" subCategory = card.find('h1').text\n",
" movie_list = []\n",
" movies = card.find_all('li')\n",
" movie_count = 1\n",
" for movie in movies:\n",
" try:\n",
" movie_title = movie.find('span',{'class':'nm-collections-title-name'}).text\n",
" movie_link = movie.find('a').get('href')\n",
" movie_list.append([movie_count, movie_title , movie_link])\n",
" movie_count += 1\n",
" except AttributeError:\n",
" pass\n",
" subCategories.append(subCategory)\n",
" subCategory_details.append(movie_list)\n",
" count += 1\n",
" except AttributeError:\n",
" pass\n",
" return subCategories, subCategory_details, count-1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def getCategories(base_url):\n",
" category_soup = make_soup(base_url)\n",
" categories = category_soup.find_all('section',{'class':'nm-collections-row'})\n",
" result=[]\n",
" count = 1\n",
" for category in categories:\n",
" try:\n",
" Title = category.find('span', {'class':'nm-collections-row-name'}).text\n",
" url = category.find('a').get('href')\n",
" result.append([count, Title, url])\n",
" count += 1\n",
" except AttributeError:\n",
" pass\n",
" #print(result)\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def main():\n",
" netflix_url = \"https://www.netflix.com/in/browse/genre/839338\"\n",
" categories = getCategories(netflix_url)\n",
" print(\"Please select one of the category\")\n",
" df = pd.DataFrame(np.array(categories), columns=['Sr.No', 'Title', 'link'])\n",
" print(df.to_string(index=False))\n",
" choice = int(input('\\n\\n Please Enter your Choice: \\n'))\n",
" subCategories, movieList, count = browseCategory(choice, categories)\n",
" for i in range(0, count):\n",
" print(subCategories[i],'\\n\\n')\n",
" subCategory_df = pd.DataFrame(np.array(movieList[i]), columns=['Sr.No', 'Title', 'link'])\n",
" print(subCategory_df.to_string(index=False))\n",
" print(\"\\n\\n\\n\")\n",
" \n",
"if __name__ == '__main__':\n",
" main()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
代码的目的是获取 Netflix 上所有类别的详细信息,然后收集每个子类别下子类别和电影的信息。
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
def make_soup(url, timeout=10):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on an unresponsive host.

    Returns:
        A ``BeautifulSoup`` tree built with the built-in ``html.parser``.

    Raises:
        requests.exceptions.RequestException: on connection problems, on a
            timeout, or when the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of parsing them as if they were content.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')
def browseCategory(category, data):
    """Scrape the sub-categories and movies listed on one category page.

    Args:
        category: 1-based position of the chosen category within ``data``.
        data: Category rows shaped like the output of ``getCategories``; each
            row is ``[serial_number, title, url]`` and only the url (index 2)
            is read here.

    Returns:
        A tuple ``(subCategories, subCategory_details, count)``:
        ``subCategories`` is the list of sub-category headings,
        ``subCategory_details`` is a parallel list holding, for each
        sub-category, ``[serial_number, title, link]`` rows for its movies,
        and ``count`` is the number of sub-categories collected.

    Raises:
        IndexError: if ``category`` is not between 1 and ``len(data)``.
    """
    # Reject 0 and negative numbers explicitly: data[category - 1] would
    # otherwise wrap around and silently pick a row from the end of the list.
    if not 1 <= category <= len(data):
        raise IndexError(
            'category must be between 1 and %d, got %r' % (len(data), category)
        )

    category_url = data[category - 1][2]
    soup = make_soup(category_url)

    subCategories = []
    subCategory_details = []
    for card in soup.find_all('section', {'class': 'nm-collections-row'}):
        heading = card.find('h1')
        if heading is None:
            # Rows without a heading are not sub-categories; skip them.
            continue

        movie_list = []
        for movie in card.find_all('li'):
            title_tag = movie.find('span', {'class': 'nm-collections-title-name'})
            link_tag = movie.find('a')
            if title_tag is None or link_tag is None:
                # List items lacking a title or a link are not movie entries.
                continue
            # Serial numbers count only the entries that were actually kept.
            movie_list.append(
                [len(movie_list) + 1, title_tag.text, link_tag.get('href')]
            )

        subCategories.append(heading.text)
        subCategory_details.append(movie_list)

    return subCategories, subCategory_details, len(subCategories)
def getCategories(base_url):
    """Collect the categories shown on a Netflix browse page.

    Args:
        base_url: Address of the browse page to scrape.

    Returns:
        A list of ``[serial_number, title, url]`` rows, numbered from 1 in
        page order. Sections without a row-name span or without a link are
        skipped and do not consume a serial number.
    """
    page = make_soup(base_url)
    rows = []
    for section in page.find_all('section', {'class': 'nm-collections-row'}):
        try:
            name = section.find('span', {'class': 'nm-collections-row-name'}).text
            href = section.find('a').get('href')
        except AttributeError:
            # find() returned None: this section is not a category row.
            continue
        rows.append([len(rows) + 1, name, href])
    return rows
def main():
    """Interactively browse one Netflix category.

    Prints the categories scraped from the browse page, asks the user to pick
    one by its serial number, then prints every sub-category of that category
    followed by a table of its movies. Returns early with a message when no
    categories were scraped or when the entered choice is not a valid serial
    number.
    """
    columns = ['Sr.No', 'Title', 'link']
    netflix_url = "https://www.netflix.com/in/browse/genre/839338"

    categories = getCategories(netflix_url)
    if not categories:
        print("No categories were found at " + netflix_url)
        return

    print("Please select one of the category")
    # Build the frame straight from the row lists: going through np.array
    # coerces every value to a string and fails outright on an empty list.
    df = pd.DataFrame(categories, columns=columns)
    print(df.to_string(index=False))

    raw_choice = input('\n\n Please Enter your Choice: \n')
    try:
        choice = int(raw_choice)
    except ValueError:
        print("Invalid choice: " + repr(raw_choice) + " is not a number")
        return
    # Guard the range here too: 0 or a negative number would otherwise index
    # the category list from the end and open the wrong category.
    if not 1 <= choice <= len(categories):
        print("Invalid choice: enter a number between 1 and " + str(len(categories)))
        return

    subCategories, movieList, _ = browseCategory(choice, categories)
    for subCategory, movies in zip(subCategories, movieList):
        print(subCategory, '\n\n')
        subCategory_df = pd.DataFrame(movies, columns=columns)
        print(subCategory_df.to_string(index=False))
        print("\n\n\n")
# Start the interactive scraper only when this code is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()