# Talking Data Starter Code
# Part 2 Setting up the program
import pandas as pd
import matplotlib.pyplot as plt
# Display settings: print every column and do not cap the width of cell text.
pd.set_option("display.max_columns", None)
pd.set_option("max_colwidth", None)
# Load the data set; the path is resolved against the current working directory.
movieData = pd.read_csv("./rotten_tomatoes_movies.csv")
# Title that the later sections look up in movieData["movie_title"].
favMovie = "The Outsiders"
print(f"My favorite movie is {favMovie}")
# Part 3 Investigate the data
# print(movieData.head())
# print(movieData["movie_title"])
# Part 4 Filter data
# Builds the two subsets used by the rest of the script:
#   favMovieData   - rows whose title exactly equals favMovie
#   dramaMovieData - rows whose genres text contains "Drama"
print("\nThe data for my favorite movie is:\n")
# Exact, case-sensitive title match; the result is empty if the title is absent.
favMovieBooleanList = movieData["movie_title"] == favMovie
favMovieData = movieData.loc[favMovieBooleanList]
print(favMovieData)
print("\n\n")
# na=False makes rows with a missing genres value count as "not Drama".
# Without it the mask can contain NaN, which .loc rejects as a boolean index.
dramaMovieBooleanList = movieData["genres"].str.contains("Drama", na=False)
dramaMovieData = movieData.loc[dramaMovieBooleanList]
numOfMovies = dramaMovieData.shape[0]
print(
    f"We will be comparing {favMovie}"
    " to other movies under the genre Drama in the data set.\n"
)
print(f"There are {numOfMovies} movies under the category Drama.")
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
# Pause until the user presses enter before showing the statistics.
input(
    f"Press enter to see more information about how {favMovie}"
    " compares to other movies in this genre.\n"
)
# Part 5 Describe data
# Prints the min, max, mean and median audience rating of the Drama subset and
# states how the favorite movie compares with each one. The comparisons are
# computed from favMovieData instead of being typed in by hand, so they stay
# correct if favMovie or the data set changes.


def _pointGap(rating, reference):
    """Return how far *rating* is from *reference*, e.g. "78 points higher than"."""
    gap = float(rating) - float(reference)
    if gap == 0:
        return "the same as"
    direction = "higher" if gap > 0 else "lower"
    # "g" formatting drops a trailing ".0" so whole-number gaps print as integers.
    return f"{abs(gap):g} points {direction} than"


def _relation(rating, reference):
    """Return "higher than", "lower than" or "equal to" for *rating* vs *reference*."""
    if rating > reference:
        return "higher than"
    if rating < reference:
        return "lower than"
    return "equal to"


dramaAudienceRatings = dramaMovieData["audience_rating"]
favAudienceRatings = favMovieData["audience_rating"].dropna()
# Use the first matching row if the title appears more than once; None when the
# title was not found or has no audience rating.
favAudienceRating = (
    favAudienceRatings.iloc[0] if not favAudienceRatings.empty else None
)
# Comparisons only make sense when both sides have at least one real rating.
canCompare = (
    favAudienceRating is not None and dramaAudienceRatings.notna().any()
)
if not canCompare:
    print("No audience rating comparison is available for " + favMovie + ".")

# min
lowestRating = dramaAudienceRatings.min()
print("The min audience rating of the data set is: " + str(lowestRating))
if canCompare:
    print(favMovie + " is rated " + _pointGap(favAudienceRating, lowestRating)
          + " the lowest rated movie.")
print()
# find max
highestRating = dramaAudienceRatings.max()
print("The max audience rating of the data set is: " + str(highestRating))
if canCompare:
    print(favMovie + " is rated " + _pointGap(favAudienceRating, highestRating)
          + " the highest rated movie.")
print()
# find mean
meanRating = dramaAudienceRatings.mean()
print("The mean audience rating of the data set is: " + str(meanRating))
if canCompare:
    print(favMovie + " is " + _relation(favAudienceRating, meanRating)
          + " the mean movie rating.")
# find median
medianRating = dramaAudienceRatings.median()
print("The median audience rating of the data set is: " + str(medianRating))
if canCompare:
    print(favMovie + " is " + _relation(favAudienceRating, medianRating)
          + " the median movie rating.")
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
# Pause until the user presses enter before drawing the graphs.
input("Press enter to see data visualizations.\n")
# Part 6 Create graphs
# Create histogram: 20 equal-width bins over 0-100 (5 rating points each).
# plt.hist returns the per-bin counts and the bin edges; they are kept so the
# printed interpretation below is derived from the data rather than typed in.
binCounts, binEdges, _histPatches = plt.hist(
    dramaMovieData["audience_rating"], range=(0, 100), bins=20
)
# Adds labels and adjusts histogram
plt.grid(True)
plt.title("Audience Ratings of Drama Movies Histogram")
plt.xlabel("Audience Ratings")
plt.ylabel("Number of Drama Movies")
# Prints interpretation of histogram
if binCounts.sum() > 0:
    peakBin = int(binCounts.argmax())
    isLastBin = peakBin == len(binCounts) - 1
    lowerEdge = binEdges[peakBin]
    upperEdge = binEdges[peakBin + 1]
    # Every bin except the last is half-open [lower, upper), so its highest
    # whole rating is one below the upper edge; the last bin includes its edge.
    peakLow = int(lowerEdge)
    peakHigh = int(upperEdge) - (0 if isLastBin else 1)
    histogramSummary = (
        "According to the histogram, the highest audience rating range is "
        f"{peakLow}-{peakHigh} with {int(binCounts[peakBin])} movies."
    )
    favHistRatings = favMovieData["audience_rating"].dropna()
    if not favHistRatings.empty:
        # First matching row is used if the title appears more than once.
        favHistRating = float(favHistRatings.iloc[0])
        if favHistRating < lowerEdge:
            position = "below"
        elif favHistRating > upperEdge or (
            favHistRating == upperEdge and not isLastBin
        ):
            position = "above"
        else:
            position = "within"
        histogramSummary += (
            f" {favMovie} is {position} that range, "
            f"with an audience rating of {favHistRating:g}."
        )
    print(histogramSummary)
else:
    print("The histogram is empty: no Drama audience ratings were found.")
print("Close the graph by pressing the 'X' in the top right corner.")
print()
# Show histogram
plt.show()
# Create scatterplot
# One point per Drama movie: audience rating on x, critic rating on y.
plt.scatter(x="audience_rating", y="critic_rating", data=dramaMovieData)
# Style the axes the scatter call just drew on: grid, title, labels, 0-100 limits.
scatterAxes = plt.gca()
scatterAxes.grid(True)
scatterAxes.set_title("Audience Rating versus Critic Rating")
scatterAxes.set_xlabel("Audience Rating")
scatterAxes.set_ylabel("Critic Rating")
scatterAxes.set_xlim(0, 100)
scatterAxes.set_ylim(0, 100)
# Prints interpretation of scatterplot
# NOTE(review): this conclusion is a fixed sentence, not computed from the
# data - confirm it still holds if the data set or genre changes.
print(
    "According to the scatter plot, there is a positive correlation "
    "between audience rating and critic rating."
)
print()
print("Close the graph by pressing the 'X' in the top right corner.")
# Show scatterplot
plt.show()
print("\nThank you for reading through my data analysis!")

# ---------------------------------------------------------------------------
# Sample console output, apparently pasted from an earlier run of this script.
# It is kept as comments so that the file remains valid Python.
# ---------------------------------------------------------------------------
# My favorite movie is The Outsiders
# The data for my favorite movie is:
# movie_title year_released critic_rating audience_rating genres
# 10714 The Outsiders 1983 63 82 Drama
# We will be comparing The Outsiders to other movies under the genre Drama in the data set.
# There are 9020 movies under the category Drama.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The min audience rating of the data set is: 4
# The Outsiders is rated 78 points higher than the lowest rated movie.
# The max audience rating of the data set is: 100
# The Outsiders is rated 18 points lower than the highest rated movie.
# The mean audience rating of the data set is: 63.316297117516626
# The Outsiders is higher than the mean movie rating.
# The median audience rating of the data set is: 66.0
# The Outsiders is higher than the median movie rating.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# According to the histogram, the highest audience rating range is 75-79 with about 880 movies. The Outsiders is significantly above that range, with an audience rating of 82.
# Close the graph by pressing the 'X' in the top right corner.
# According to the scatter plot, there is a positive correlation between audience rating and critic rating.
# Close the graph by pressing the 'X' in the top right corner.
# Thank you for reading through my data analysis!