|
import re |
|
import json |
|
import requests |
|
from .parser import ImdbParser |
|
from requests_html import HTMLSession |
|
from requests.packages.urllib3.exceptions import InsecureRequestWarning |
|
|
|
requests.packages.urllib3.disable_warnings(InsecureRequestWarning) |
|
|
|
|
|
class IMDB: |
|
""" |
|
A class to represent IMDB API. |
|
|
|
-------------- |
|
|
|
Main Methods of the IMDB API |
|
-------------- |
|
#1. search(name, year=None, tv=False, person=False) |
|
-- to search a query on IMDB |
|
|
|
#2. get_by_name(name, year=None, tv=False) |
|
-- to get a Movie/TV-Series info by it's name (pass year also to increase accuracy) |
|
|
|
#3. get_by_id(file_id) |
|
-- to get a Movie/TV-Series info by it's IMDB-ID (pass year also to increase accuracy) |
|
|
|
#4. person_by_name(name) |
|
-- to get a person's info by his/her name |
|
|
|
#5. person_by_id( p_id) |
|
-- to get a person's info by his/her IMDB-ID |
|
|
|
#6. upcoming(region=None) |
|
-- to get upcoming movies/TV-Series |
|
|
|
#7. popular_movies(genre=None, start_id=1, sort_by=None) |
|
-- to get IMDB popular movies |
|
|
|
#8. popular_tv(genre=None, start_id=1, sort_by=None) |
|
-- to get IMDB popular Tv-Series |
|
""" |
|
def __init__(self): |
|
self.session = HTMLSession() |
|
self.headers = { |
|
"Accept": "application/json, text/plain, */*", |
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36", |
|
"Referer": "https://www.imdb.com/" |
|
} |
|
self.baseURL = "https://www.imdb.com" |
|
self.search_results = {'result_count': 0, 'results': []} |
|
self.NA = json.dumps({"status": 404, "message": "No Result Found!", 'result_count': 0, 'results': []}) |
|
|
|
|
|
def search(self, name, year=None, tv=False, person=False): |
|
""" |
|
@description:- Helps to search a query on IMDB. |
|
@parameter-1:- <str:name>, query value to search. |
|
@parameter-2:- <int:year> OPTIONAL, release year of query/movie/tv/file to search. |
|
@parameter-3:- <bool:tv> OPTIONAL, to filter/limit/bound search results only for 'TV Series'. |
|
@parameter-4:- <bool:person> OPTIONAL, to filter search results only for person. |
|
@returns:- A JSON string: |
|
- {'result_count': <int:total_search_results>, 'results': <list:list_of_files/movie_info_dict>} |
|
""" |
|
assert isinstance(name, str) |
|
self.search_results = {'result_count': 0, 'results': []} |
|
|
|
name = name.replace(" ", "+") |
|
|
|
if year is None: |
|
url = f"https://www.imdb.com/find?q={name}" |
|
else: |
|
assert isinstance(year, int) |
|
url = f"https://www.imdb.com/find?q={name}+{year}" |
|
|
|
|
|
try: |
|
response = self.session.get(url) |
|
except requests.exceptions.ConnectionError as e: |
|
response = self.session.get(url, verify=False) |
|
|
|
|
|
results = response.html.xpath("//section[@data-testid='find-results-section-title']/div/ul/li") |
|
|
|
if tv is True: |
|
results = [result for result in results if "TV" in result.text] |
|
|
|
if person is True: |
|
results = response.html.xpath("//section[@data-testid='find-results-section-name']/div/ul/li") |
|
results = [result for result in results if 'name' in result.find('a')[0].attrs['href']] |
|
|
|
output = [] |
|
for result in results: |
|
name = result.text.replace('\n', ' ') |
|
url = result.find('a')[0].attrs['href'] |
|
if ('Podcast' not in name) and ('Music Video' not in name): |
|
try: |
|
image = result.xpath("//img")[0].attrs['src'] |
|
file_id = url.split('/')[2] |
|
output.append({ |
|
'id': file_id, |
|
"name": name, |
|
"url": f"https://www.imdb.com{url}", |
|
"poster": image |
|
}) |
|
except IndexError: |
|
pass |
|
self.search_results = {'result_count': len(output), 'results': output} |
|
return json.dumps(self.search_results, indent=2) |
|
|
|
|
|
def get(self, url): |
|
""" |
|
@description:- helps to get a file's complete info (used by get_by_name() & get_by_id() ) |
|
@parameter:- <str:url>, url of the file/movie/tv-series. |
|
@returns:- File/movie/TV info as JSON string. |
|
""" |
|
try: |
|
response = self.session.get(url) |
|
result = response.html.xpath("//script[@type='application/ld+json']")[0].text |
|
result = ''.join(result.splitlines()) |
|
result = f"""{result}""" |
|
|
|
except IndexError: |
|
return self.NA |
|
try: |
|
|
|
result = json.loads(result) |
|
except json.decoder.JSONDecodeError as e: |
|
|
|
try: |
|
to_parse = ImdbParser(result) |
|
|
|
parsed = to_parse.remove_trailer |
|
parsed = to_parse.remove_description |
|
|
|
result = json.loads(parsed) |
|
except json.decoder.JSONDecodeError as e: |
|
try: |
|
|
|
parsed = to_parse.remove_review_body |
|
result = json.loads(parsed) |
|
except json.decoder.JSONDecodeError as e: |
|
|
|
return self.NA |
|
|
|
output = { |
|
"type": result.get('@type'), |
|
"name": result.get('name'), |
|
"url": self.baseURL + result.get('url').split("/title")[-1], |
|
"poster": result.get('image'), |
|
"description": result.get('description'), |
|
"review": { |
|
"author": result.get("review", {'author': {'name': None}}).get('author').get('name'), |
|
"dateCreated": result.get("review", {"dateCreated": None}).get("dateCreated"), |
|
"inLanguage": result.get("review", {"inLanguage": None}).get("inLanguage"), |
|
"heading": result.get("review", {"name": None}).get("name"), |
|
"reviewBody": result.get("review", {"reviewBody": None}).get("reviewBody"), |
|
"reviewRating": { |
|
"worstRating": result.get("review", {"reviewRating": {"worstRating": None}}) |
|
.get("reviewRating",{"worstRating": None}).get("worstRating"), |
|
"bestRating": result.get("review", {"reviewRating": {"bestRating": None}}) |
|
.get("reviewRating",{"bestRating": None}).get("bestRating"), |
|
"ratingValue": result.get("review", {"reviewRating": {"ratingValue": None}}) |
|
.get("reviewRating",{"ratingValue": None}).get("ratingValue"), |
|
}, |
|
}, |
|
"rating": { |
|
"ratingCount": result.get("aggregateRating", {"ratingCount": None}).get("ratingCount"), |
|
"bestRating": result.get("aggregateRating", {"bestRating": None}).get("bestRating"), |
|
"worstRating": result.get("aggregateRating", {"worstRating": None}).get("worstRating"), |
|
"ratingValue": result.get("aggregateRating", {"ratingValue": None}).get("ratingValue"), |
|
}, |
|
"contentRating": result.get("contentRating"), |
|
"genre": result.get("genre"), |
|
"datePublished": result.get("datePublished"), |
|
"keywords": result.get("keywords"), |
|
"duration": result.get("duration"), |
|
"actor": [ |
|
{"name": actor.get("name"), "url": actor.get("url")} for actor in result.get("actor", []) |
|
], |
|
"director": [ |
|
{"name": director.get("name"), "url": director.get("url")} for director in result.get("director", []) |
|
], |
|
"creator": [ |
|
{"name": creator.get("name"), "url": creator.get("url")} for creator in result.get("creator", []) |
|
if creator.get('@type') == 'Person' |
|
] |
|
} |
|
return json.dumps(output, indent=2) |
|
|
|
def get_by_name(self, name, year=None, tv=False): |
|
""" |
|
@description:- Helps to search a file/movie/tv by name. |
|
@parameter-1:- <str:name>, query/name to search. |
|
@parameter-2:- <int:year> OPTIONAL, release year of query/movie/tv/file to search. |
|
@parameter-3:- <bool:tv> OPTIONAL, to filter/limit/bound search result only for 'TV Series'. |
|
@returns:- File/movie/TV info as JSON string. |
|
""" |
|
results = json.loads(self.search(name, year=year)) |
|
all_results = [i for i in self.search_results['results'] if 'title' in i['url']] |
|
|
|
|
|
|
|
if tv is True: |
|
tv_only = [result for result in all_results if "TV" in result['name']] |
|
if year is not None: |
|
tv_only = [result for result in tv_only if str(year) in result['name']] |
|
|
|
if bool(tv_only): |
|
tv_only_checked = [result for result in tv_only if result['name'].lower().startswith(name.split(" ")[0].lower())] |
|
tv_only = tv_only_checked if bool(tv_only_checked) else tv_only |
|
results['results'] = tv_only if bool(tv_only) else all_results |
|
|
|
else: |
|
movie_only = [result for result in all_results if "TV" not in result['name']] |
|
if year is not None: |
|
movie_only = [result for result in movie_only if str(year) in result['name']] |
|
|
|
if bool(movie_only): |
|
movie_only_checked = [result for result in movie_only if result['name'].lower().startswith(name.split(" ")[0].lower())] |
|
movie_only = movie_only_checked if bool(movie_only_checked) else movie_only |
|
results['results'] = movie_only if bool(movie_only) else all_results |
|
|
|
|
|
if len(results['results']) > 0: |
|
return self.get(results['results'][0].get('url')) |
|
else: |
|
return self.NA |
|
|
|
def get_by_id(self, file_id): |
|
""" |
|
@description:- Helps to search a file/movie/tv by its imdb ID. |
|
@parameter-1:- <str:file_id>, imdb ID of the file/movie/tv. |
|
@returns:- File/movie/TV info as JSON string. |
|
""" |
|
assert isinstance(file_id, str) |
|
url = f"{self.baseURL}/title/{file_id}" |
|
return self.get(url) |
|
|
|
|
|
def get_person(self, url): |
|
""" |
|
@description:- Helps to search a person info by its url, (used by person_by_name() & person_by_id() ). |
|
@parameter-1:- <str:url>, url of the person's profile page. |
|
@returns:- Person's info as JSON string. |
|
""" |
|
try: |
|
response = self.session.get(url) |
|
result = response.html.xpath("//script[@type='application/ld+json']")[0].text |
|
result = f"""{result}""" |
|
result = json.loads(result) |
|
except json.decoder.JSONDecodeError as e: |
|
return self.NA |
|
|
|
del result["@context"] |
|
result['type'] = result.get('@type') |
|
del result["@type"] |
|
return json.dumps(result, indent=2) |
|
|
|
def person_by_name(self, name): |
|
""" |
|
@description:- Helps to search a person info by its name. |
|
@parameter-1:- <str:name>, name of the person. |
|
@returns:- Person's info as JSON string. |
|
""" |
|
results = json.loads(self.search(name, person=True)) |
|
|
|
url = results['results'][0].get('url') |
|
return self.get_person(url) |
|
|
|
def person_by_id(self, p_id): |
|
""" |
|
@description:- Helps to search a person info by its imdb ID. |
|
@parameter-1:- <str:p_id>, imdb ID of the person's profile. |
|
@returns:- Person's info as JSON string. |
|
""" |
|
assert isinstance(p_id, str) |
|
url = f"{self.baseURL}/name/{p_id}" |
|
return self.get_person(url) |
|
|
|
|
|
def upcoming(self, region=None): |
|
""" |
|
@description:- Helps to get upcoming movies/tv-series. |
|
@parameter-1:- <str:region> OPTIONAL, country code (like US, IN etc.) to filter results by region/country. |
|
@returns:- upcoming movies/TV-Series info as JSON string. |
|
""" |
|
if region is not None: |
|
assert isinstance(region, str) |
|
url = f"https://www.imdb.com/calendar?region={region}" |
|
else: |
|
url = "https://www.imdb.com/calendar" |
|
|
|
try: |
|
response = self.session.get(url) |
|
except requests.exceptions.ConnectionError as e: |
|
response = self.session.get(url, verify=False) |
|
|
|
output = [] |
|
div = response.html.xpath("//main")[0] |
|
|
|
articles = div.find('article') |
|
for article in articles: |
|
h3 = article.find('h3')[0] |
|
ul = article.xpath('//ul')[0].xpath('//li') |
|
for li in ul: |
|
try: |
|
movie = li.find('a')[0] |
|
poster = ul[0].find('img')[0].attrs.get('src') |
|
output.append({ |
|
'id': movie.attrs['href'].split('/')[2], |
|
'name': movie.text, |
|
'url': self.baseURL + movie.attrs['href'], |
|
'release_data': h3.text, |
|
'poster': poster.split(',')[0] |
|
}) |
|
except IndexError: |
|
pass |
|
|
|
results = {'result_count': len(output), 'results': output} |
|
if results['result_count'] > 0: |
|
return json.dumps(results, indent=2) |
|
else: |
|
return self.NA |
|
|
|
|
|
def get_popular(self, url): |
|
""" |
|
@description:- Helps to search popular movies/TV-Series by url, (used by popular_movies() & popular_tv() ). |
|
@parameter-1:- <str:url>, url to search. |
|
@returns:- Files/Movies/TV-Series info as JSON string. |
|
""" |
|
assert isinstance(url, str) |
|
try: |
|
response = self.session.get(url) |
|
except requests.exceptions.ConnectionError as e: |
|
response = self.session.get(url, verify=False) |
|
|
|
all_li = response.html.xpath('//ul[@role="presentation"]/li') |
|
|
|
output = [] |
|
|
|
for li in all_li: |
|
for obj in li.find('a'): |
|
if ("title" in obj.attrs.get('href')) and (". " in obj.text): |
|
href = obj.attrs.get('href') |
|
name = obj.text.split(". ")[-1] |
|
break |
|
|
|
|
|
for span in li.find('span'): |
|
if len(span.text.strip()) == 4: |
|
try: |
|
year = int(span.text.strip()) |
|
break |
|
except: |
|
year = "N/A" |
|
|
|
|
|
try: |
|
file_id = href.split('/')[2] |
|
poster = li.xpath("//img[@loading='lazy']") |
|
poster = poster[0].attrs.get('src') |
|
poster = poster if bool(poster) else 'image_not_found' |
|
except: |
|
poster = 'image_not_found' |
|
|
|
output.append({ |
|
'id': file_id, |
|
'name': name, |
|
'year': year, |
|
'url': self.baseURL + href, |
|
'poster': poster |
|
}) |
|
|
|
self.search_results = {'result_count': len(output), 'results': output} |
|
return json.dumps(self.search_results, indent=2) |
|
|
|
def popular_movies(self, genre=None, start_id=1, sort_by=None): |
|
""" |
|
@description:- Helps to get 50 popular movies starting from <start_id>. |
|
@parameter-1:- <str:genre> OPTIONAL, to filter results by genre. |
|
@parameter-2:- <int:start_id> DEFAULT=1, start id to show results (shows results from start_id to start_id+50). |
|
@parameter-3:- <bool:sort_by> OPTIONAL, to sort results (eg. sort=user_rating,desc OR sort=user_rating,asc). |
|
- (visit 'https://www.imdb.com/search/title/?title_type=movie' for more info) |
|
@returns:- Popular Movies (by genre) info as JSON string. |
|
""" |
|
assert isinstance(start_id, int) |
|
if genre is not None: |
|
assert isinstance(genre, str) |
|
url = f"https://www.imdb.com/search/title/?title_type=movie&genres={genre}&start={start_id}&sort={sort_by}" |
|
else: |
|
url = f"https://www.imdb.com/search/title/?title_type=movie&start={start_id}&sort={sort_by}" |
|
return self.get_popular(url) |
|
|
|
def popular_tv(self, genre=None, start_id=1, sort_by=None): |
|
""" |
|
@description:- Helps to get 50 popular TV-Series starting from <start_id>. |
|
@parameter-1:- <str:genre> OPTIONAL, to filter results by genre. |
|
@parameter-2:- <int:start_id> DEFAULT=1, start id to show results (shows results from start_id to start_id+50). |
|
@parameter-3:- <bool:sort_by> OPTIONAL, to sort results (eg. sort=user_rating,desc OR sort=user_rating,asc). |
|
- (visit 'https://www.imdb.com/search/title/?title_type=movie' for more info) |
|
@returns:- Popular TV-Series info as JSON string. |
|
""" |
|
assert isinstance(start_id, int) |
|
if genre is not None: |
|
assert isinstance(genre, str) |
|
url = f"https://www.imdb.com/search/title/?title_type=tv_series,tv_miniseries&genres={genre}&start={start_id}&sort={sort_by}" |
|
else: |
|
url = f"https://www.imdb.com/search/title/?title_type=tv_series,tv_miniseries&start={start_id}&sort={sort_by}" |
|
|
|
return self.get_popular(url) |
|
|
|
|