Many of you have asked for more videos, so I'm making two weeks' worth of videos on Statistics with Python. Every day at 1 PM Eastern Standard Time I'll upload a new video. If this goes well, maybe I'll go back to daily uploads like I used to do. That is up to you and the number of views these videos get.
This tutorial series will cover all of the statistics needed for data science and machine learning, and I'll write all of the Python code to make everything work. I hope you like it.
Links to All the Code
import math


def mean(*args):
    val_sum = sum(args)
    return val_sum / len(args)


def median(*args):
    # Assumes the values are passed in sorted order
    if len(args) % 2 == 0:
        # Average the two middle values
        i = len(args) // 2
        j = i - 1
        return (args[i] + args[j]) / 2
    else:
        # Return the middle value
        k = len(args) // 2
        return args[k]


def mode(*args):
    # Count how many times values show up in
    # the list and put it in a dictionary
    dict_vals = {i: args.count(i) for i in args}
    # Create a list of keys that have the maximum
    # number of occurrences in the list
    max_list = [k for k, v in dict_vals.items() if v == max(dict_vals.values())]
    return max_list


def variance(*args):
    mean_val = mean(*args)
    numerator = 0
    for i in args:
        numerator += (i - mean_val) ** 2
    denominator = len(args) - 1
    return numerator / denominator


def standard_deviation(*args):
    return math.sqrt(variance(*args))


def coefficient_variation(*args):
    return standard_deviation(*args) / mean(*args)


def covariance(*args):
    # Use a list comprehension to get all values
    # stored in the 1st & 2nd list
    list_1 = [i[0] for i in args]
    list_2 = [i[1] for i in args]
    # Pass those lists to get their means
    list_1_mean = mean(*list_1[0])
    list_2_mean = mean(*list_2[0])
    numerator = 0
    # We must have the same number of elements
    # in both lists
    if len(list_1[0]) == len(list_2[0]):
        for i in range(len(list_1[0])):
            # Find xi - x mean * yi - y mean
            numerator += (list_1[0][i] - list_1_mean) * (list_2[0][i] - list_2_mean)
        denominator = len(list_1[0]) - 1
        return numerator / denominator
    else:
        print("Error : You must have the same number of values in both lists")


def correlation_coefficient(*args):
    list_1 = [i[0] for i in args]
    list_2 = [i[1] for i in args]
    # Pass those lists to get their standard deviations
    list_1_sd = standard_deviation(*list_1[0])
    list_2_sd = standard_deviation(*list_2[0])
    print(f"L1 SD : {list_1_sd}")
    print(f"L2 SD : {list_2_sd}")
    denominator = list_1_sd * list_2_sd
    # Get the covariance
    numerator = covariance(*args)
    return numerator / denominator


print(f"Mean : {mean(1, 2, 3, 4, 5)}")
print(f"Median : {median(1, 2, 3, 4, 5)}")
print(f"Median : {median(1, 2, 3, 4, 5, 6)}")
print(f"Mode : {mode(1, 2, 3, 4, 5, 4, 5)}")
print(f"Variance : {variance(4, 6, 3, 5, 2)}")
print(f"Standard Deviation : {standard_deviation(4, 6, 3, 5, 2)}")
print(f"Coefficient Variation (miles): {coefficient_variation(3, 4, 4.5, 3.5)}")
print(f"Coefficient Variation (kms): {coefficient_variation(4.828, 6.437, 7.242, 5.632)}")

# List that contains market cap in the 1st list
# and earnings in the 2nd list
m_d_list = [[1532, 1488, 1343, 928, 615], [58, 35, 75, 41, 17]]
print(f"Stock Covariance : {covariance(m_d_list)}")
# Get the Correlation Coefficient
print(f"Correlation Coefficient : {correlation_coefficient(m_d_list)}")
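Because covariance() and correlation_coefficient() expect a single two-row list like m_d_list, it is easy to lose track of what they do internally. Here is a small sanity check of my own (not part of the original script, and assuming it is appended to the file above) that recomputes the same stock covariance and correlation coefficient straight from the formulas cov = Σ(xᵢ - x̄)(yᵢ - ȳ) / (n - 1) and r = cov / (σₓ · σᵧ).

# Hypothetical sanity check (my addition): recompute the stock covariance
# and correlation coefficient directly from their formulas
x_vals = [1532, 1488, 1343, 928, 615]   # market cap
y_vals = [58, 35, 75, 41, 17]           # earnings

x_bar = mean(*x_vals)
y_bar = mean(*y_vals)

# Sample covariance: sum of (xi - x bar)(yi - y bar) divided by n - 1
cov_check = sum((x - x_bar) * (y - y_bar) for x, y in zip(x_vals, y_vals)) / (len(x_vals) - 1)
# Correlation coefficient: covariance divided by the product of the standard deviations
r_check = cov_check / (standard_deviation(*x_vals) * standard_deviation(*y_vals))

print(f"Check Covariance : {cov_check}")
print(f"Check Correlation Coefficient : {r_check}")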
Sale ID,Company,Contact,Contact Sex,Contact Age,Contact City,Contact State,Product Company,Product ID,Product Type,Sale Price,Our Cost,Shipping Cost,Profit,Lead Source,Sale Month,Sale Year
1,PNT Designs,Paul Thomas,M,43,Carnegie,OH,HP,M01-F0024,Desktop,479.99,315.50,21.10,143.39,Website,January,2018
2,Ace Manufacturing,Margo Simms,F,37,Larimer,WV,HP,GT13-0024,Desktop,1249.99,998.00,21.10,230.89,Flyer 4,January,2018
3,Sammie's,Sam Stine,M,26,Pittsburgh,PA,Dell,I3670,Desktop,649.99,510.25,21.10,118.64,Website,February,2018
4,One Moe Time,Moe Eggert,M,35,St. Clair,PA,Dell,I3593,Laptop,399.99,310.50,17.40,72.09,Website,March,2018
5,Get Going Gym,Jessica Elk,F,55,Pittsburgh,PA,HP,15M-ED,Laptop,699.99,584.50,17.40,98.09,Flyer 4,March,2018
6,ANX Trucking,Sally Struthers,F,45,St. Clair,PA,HP,GT13-0024,Desktop,1249.99,998.00,21.10,230.89,Flyer 2,April,2018
7,Samms Grooming,Michelle Samms,F,46,Carnegie,OH,ASUS,GA401IV,Laptop,1349.99,1152.25,17.40,180.34,Email,May,2018
8,Roberts Produce,Mick Roberts,M,23,Lawrenceville,OH,Apple,MY2J2LL,Tablet,999.99,845.00,8.30,146.69,Website,July,2018
9,Klondike Dairy,Ed Klondike,M,52,Carnegie,OH,Lenovo,81TC00,Laptop,649.99,510.25,17.40,122.34,Email,July,2018
10,Jones Manufacturing,Phil Jones,M,56,Larimer,WV,HP,M01-F0024,Desktop,479.99,315.50,21.10,143.39,Flyer 2,August,2018
11,James Cycles,Rick James,M,49,Pittsburgh,PA,ASUS,GA401IV,Laptop,1349.99,1152.25,17.40,180.34,Flyer 3,November,2018
12,Weight Stalkers,Sue Etna,F,54,Banksville,OH,HP,GT13-0024,Desktop,1249.99,998.00,21.10,230.89,Flyer 2,November,2018
13,Case Solutions,Jason Case,M,57,Pittsburgh,PA,Lenovo,81TC00,Laptop,649.99,510.25,17.40,122.34,Email,November,2018
14,Doug's House,Doug Johnson,M,51,Pittsburgh,PA,Dell,I3670,Desktop,649.99,510.25,21.10,118.64,Website,December,2018
15,Helms Manufacturing,Andy Sands,M,56,Lawrenceville,OH,Apple,MY2J2LL,Tablet,999.99,845.00,8.30,146.69,Flyer 1,December,2018
16,Collins Advertising,Kim Collins,F,49,Pittsburgh,PA,Dell,I3593,Laptop,399.99,310.50,17.40,72.09,Flyer 2,January,2019
17,Owens & Sons,Edna Sanders,F,46,Lawrenceville,OH,HP,15M-ED,Laptop,699.99,584.50,17.40,98.09,Email,February,2019
18,Samms Grooming,Michelle Samms,F,46,Banksville,NY,Apple,MY2J2LL,Tablet,999.99,845.00,8.30,146.69,Website,March,2019
19,Roberts Produce,Mick Roberts,M,23,Pittsburgh,PA,Dell,I3593,Laptop,399.99,310.50,17.40,72.09,Flyer 4,March,2019
20,ANX Trucking,Sally Struthers,F,45,Banksville,NY,Lenovo,81TC00,Laptop,649.99,510.25,17.40,122.34,Website,April,2019
21,Case Solutions,Jason Case,M,57,Pittsburgh,PA,HP,M01-F0024,Desktop,479.99,315.50,21.10,143.39,Flyer 4,May,2019
22,Doug's House,Doug Johnson,M,51,Pittsburgh,PA,ASUS,GA401IV,Laptop,1349.99,1152.25,17.40,180.34,Website,August,2019
23,PNT Designs,Paul Thomas,M,43,Carnegie,OH,Lenovo,81TC00,Laptop,649.99,510.25,17.40,122.34,Website,August,2019
24,Ace Manufacturing,Margo Simms,F,37,Larimer,WV,ASUS,Q526FA,Laptop,1049.99,889.50,17.40,143.09,Flyer 4,November,2019
25,Samms Grooming,Michelle Samms,F,46,Banksville,NY,Dell,I3670,Desktop,649.99,510.25,21.10,118.64,Flyer 2,November,2019
26,Roberts Produce,Mick Roberts,M,23,Pittsburgh,PA,ASUS,Q526FA,Laptop,1049.99,889.50,17.40,143.09,Email,November,2019
27,Klondike Dairy,Ed Klondike,M,52,Carnegie,OH,ASUS,Q526FA,Laptop,1049.99,889.50,17.40,143.09,Website,December,2019
28,One Moe Time,Moe Eggert,M,35,St. Clair,PA,HP,15M-ED,Laptop,699.99,584.50,17.40,98.09,Email,December,2019
29,Get Going Gym,Jessica Elk,F,55,Pittsburgh,PA,ASUS,GA401IV,Laptop,1349.99,1152.25,17.40,180.34,Flyer 2,December,2019
30,Jones Manufacturing,Phil Jones,M,56,Larimer,WV,HP,M01-F0024,Desktop,479.99,315.50,21.10,143.39,Flyer 2,January,2020
31,James Cycles,Rick James,M,49,Pittsburgh,PA,ASUS,GA401IV,Laptop,1349.99,1152.25,17.40,180.34,Flyer 1,January,2020
32,Weight Stalkers,Sue Etna,F,54,Banksville,OH,HP,GT13-0024,Desktop,1249.99,998.00,21.10,230.89,Flyer 2,February,2020
33,Collins Advertising,Kim Collins,F,49,Pittsburgh,PA,Dell,I3593,Laptop,399.99,310.50,17.40,72.09,Flyer 2,March,2020
34,Owens & Sons,Edna Sanders,F,46,Lawrenceville,OH,HP,15M-ED,Laptop,699.99,584.50,17.40,98.09,Email,March,2020
35,Samms Grooming,Michelle Samms,F,46,Banksville,NY,Apple,MY2J2LL,Tablet,999.99,845.00,8.30,146.69,Website,April,2020
36,ANX Trucking,Sally Struthers,F,45,Banksville,NY,Lenovo,81TC00,Laptop,649.99,510.25,17.40,122.34,Website,April,2020
37,Case Solutions,Jason Case,M,57,Pittsburgh,PA,HP,M01-F0024,Desktop,479.99,315.50,21.10,143.39,Flyer 4,April,2020
38,Doug's House,Doug Johnson,M,51,Pittsburgh,PA,ASUS,GA401IV,Laptop,1349.99,1152.25,17.40,180.34,Website,May,2020
39,One Moe Time,Moe Eggert,M,35,St. Clair,PA,Dell,I3593,Laptop,399.99,310.50,17.40,72.09,Website,May,2020
import csv
import statistics as stats

# Pulls data on sales from a csv file
with open("computersales.csv", newline="") as csv_file:
    reader = csv.reader(csv_file)
    sales_data = list(reader)


def get_string_data_from_csv(index):
    # Will hold column data pulled from CSV file
    data_list = [0] * 40
    # Dictionary that will hold string and count
    # of that string in the list
    data_dict = {}
    # Get items in question from CSV file
    for i in range(1, len(sales_data)):
        data_list[i] = sales_data[i][index]
    # Delete 1st index with no data
    del data_list[0]
    # Convert to a set to get only unique values
    data_set = set(data_list)
    unique_list = list(data_set)
    for i in range(0, len(unique_list)):
        # Get number of times item shows in the list
        # using string stored in unique_list
        num_of_items = data_list.count(unique_list[i])
        # Add key and value to the dictionary
        data_dict[unique_list[i]] = num_of_items
    return data_dict


def get_key_profit_list(index, data_dict):
    profit_dict = {}
    for key in data_dict.keys():
        # Create dictionary key with a list
        profit_dict[key] = []
        for i in range(1, len(sales_data)):
            if key == sales_data[i][index]:
                # Add profit to list in dictionary
                profit_dict.setdefault(key, []).append(float(sales_data[i][13]))
    return profit_dict


def get_profit_mean_category_dict(data_dict):
    # Holds dict key and mean profit for each key
    profit_mean_dict = {}
    # Creates a dict with key and mean value for each category
    for key in data_dict.keys():
        profit_mean_dict[key] = stats.mean(*data_dict[key])
    return profit_mean_dict


# NEW Gets the standard deviation for all categories
def get_standard_deviation(data_dict):
    # Holds dict key and standard deviation of profit for each key
    sd_dict = {}
    # Creates a dict with key and standard deviation for each category
    for key in data_dict.keys():
        sd_dict[key] = stats.standard_deviation(*data_dict[key])
    return sd_dict


# NEW Gets the coefficient of variation for all categories
def get_coefficient_variation(data_dict):
    # Holds dict key and coefficient of variation for each key
    cv_dict = {}
    # Creates a dict with key and coefficient of variation for each category
    for key in data_dict.keys():
        cv_dict[key] = stats.coefficient_variation(*data_dict[key])
    return cv_dict


def get_mean_profit_data(title, index):
    print(title + " Data")
    category_dict = get_string_data_from_csv(index)
    # Print number in each category
    print(category_dict)
    # Print categories with all profits as a list
    print(f"{title} Profit List : {get_key_profit_list(index, category_dict)}")
    # Print categories with the mean of profits
    print(f"{title} Profit Mean : {get_profit_mean_category_dict(get_key_profit_list(index, category_dict))}")
    # NEW Get Standard Deviation
    print(f"{title} Standard Deviation : {get_standard_deviation(get_key_profit_list(index, category_dict))}")
    # NEW Get Coefficient Variation
    print(f"{title} Coefficient Variation : {get_coefficient_variation(get_key_profit_list(index, category_dict))}\n")


# Gets sum of categories, sales per category and
# mean profits for categories
get_mean_profit_data("Sex", 3)
get_mean_profit_data("State", 6)
get_mean_profit_data("Product Company", 7)
get_mean_profit_data("Product Type", 9)
get_mean_profit_data("Lead Source", 14)
get_mean_profit_data("Sale Month", 15)
get_mean_profit_data("Sale Year", 16)


# Receives the index I'm searching in the CSV file along with
# a list of the maximums for each range, from the 1st max on up.
# Example: [10, 20, 30] would create the ranges 0-10, 11-20 and 21-30
def get_range_data(index, max_rng_list):
    # Creates the dictionary with dynamically created keys that define
    # the ranges I want to grab
    # If supplied with [29, 39] it makes {'rng_0_29': 0, 'rng_30_39': 0}
    range_dict = {}
    range_index = 0
    for i in range(0, len(max_rng_list)):
        rng_key = "rng_" + str(range_index) + "_" + str(max_rng_list[i])
        range_dict[rng_key] = 0
        range_index = int(max_rng_list[i] + 1)
    # Cycle through the dictionary keys and get the high and low
    # range
    for key in range_dict.keys():
        rng_list = key.split("_")
        low_range = rng_list[1]
        high_range = rng_list[2]
        # Cycle through data in the supplied index while searching for
        # matches in the range defined in the dictionary keys
        for i in range(1, len(sales_data)):
            if int(low_range) < int(sales_data[i][index]) <= int(high_range):
                range_dict[key] += 1
    return range_dict


# Define the maximums I want for each range to search for
# in sales_data
my_list = [29, 39, 49, 80]
age_dict = get_range_data(4, my_list)
print(age_dict)


def get_range_profit_list(index, data_dict):
    profit_dict = {}
    for key in data_dict.keys():
        # Create dictionary key with a list
        profit_dict[key] = []
        # Get range data used to search in sales_data
        rng_list = key.split("_")
        low_range = rng_list[1]
        high_range = rng_list[2]
        for i in range(1, len(sales_data)):
            if int(low_range) < int(sales_data[i][index]) <= int(high_range):
                # Add profit to list in dictionary
                profit_dict.setdefault(key, []).append(float(sales_data[i][13]))
    return profit_dict


def get_range_profit_mean_category_dict(index, data_dict):
    # Holds dict key and mean profit for each key
    profit_mean_dict = {}
    # Creates a dict with key and mean value for each category
    for key in data_dict.keys():
        # Create dictionary key with a list
        profit_mean_dict[key] = []
        # Get range data used to search in sales_data
        rng_list = key.split("_")
        low_range = rng_list[1]
        high_range = rng_list[2]
        for i in range(1, len(sales_data)):
            if int(low_range) < int(sales_data[i][index]) <= int(high_range):
                # Add profit to list in dictionary
                profit_mean_dict.setdefault(key, []).append(float(sales_data[i][13]))
        # Get mean of list and store the mean of the
        # list instead of the list of values
        profit_mean_dict[key] = stats.mean(*profit_mean_dict[key])
    return profit_mean_dict


print(get_range_profit_list(4, age_dict))
print(get_range_profit_mean_category_dict(4, age_dict))


# NEW Gets all data in the list for the chosen index
def get_category_list(index):
    data_list = [0] * 40
    # Get all values for provided index
    for i in range(1, len(sales_data)):
        data_list[i] = sales_data[i][index]
    # Delete 1st index with no data
    del data_list[0]
    return data_list


# Gets profit list in order and not sorted
profit_list = [0] * 40
del profit_list[0]
profit_list = get_category_list(13)
print(profit_list)
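To make the range-key mechanics easier to follow, here is a tiny standalone sketch of my own (not from the original post) that buckets a plain list of ages the same way get_range_data does, without touching the CSV file.

# Hypothetical standalone example (my addition) of the same bucketing idea
# used by get_range_data, applied to a plain list of ages
ages = [43, 37, 26, 35, 55, 45, 46, 23, 52, 56]
maximums = [29, 39, 49, 80]

buckets = {}
low = 0
for high in maximums:
    # Mirror the comparison used above: low < age <= high
    buckets[f"rng_{low}_{high}"] = sum(1 for age in ages if low < age <= high)
    low = high + 1

print(buckets)  # {'rng_0_29': 2, 'rng_30_39': 2, 'rng_40_49': 3, 'rng_50_80': 3}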
import statistics as stats

# A Probability Distribution finds the probability of different
# outcomes * A coin flip has a probability distribution of .5 *
# A die roll has a probability distribution of 1/6 or .167 *
# When you sum all probabilities you get a value of 1.

# You see here the probabilities of all rolls with 2 dice *
# A Relative Frequency Histogram charts out all those probabilities.
# Pay particular attention to the shape of that chart because...

# Next we'll talk about a Normal Distribution. A Normal Distribution
# is when data forms a bell curve. Also, 1 standard deviation covers
# 68% of the data, 2 standard deviations cover 95%
# and 3 cover 99.7%.

# To have a Normal Distribution, Mean = Median = Mode * Also,
# 50% of values are less than the mean and 50% are greater than it.

# A Standard Normal Distribution has a mean of zero and a standard
# deviation of 1. If we calculate the mean of the list below we see
# it is 4. If we calculate the standard deviation it comes to 1.58.

# We can turn this into a Standard Normal Distribution by subtracting
# the mean from each value and dividing by the standard deviation. If
# we do that we get the chart here.

dice_list = [1, 2, 4, 4, 4, 5, 5, 5, 6]
print(f"Sum : {sum(dice_list)}")
print(f"Mean : {stats.mean(*dice_list)}")
print(f"Standard Deviation : {stats.standard_deviation(*dice_list)}")
normalized_list = stats.normalize_list(*dice_list)
print(f"Normal List : {normalized_list}")
print(f"Normal Mean : {stats.mean(*normalized_list)}")
print(f"Normal Standard Deviation : {stats.standard_deviation(*normalized_list)}")

# The Central Limit Theorem states that the more samples you take
# the closer you get to the mean. Also, the distribution will
# approximate the Normal Distribution * As you can see, as the sample
# size increases the standard deviation decreases.

# The Standard Error measures the accuracy of an estimate. To find
# it, divide the standard deviation by the square root of the sample
# size. Again notice that as the sample size increases the Standard
# Error decreases.

print(f"Standard Error : {stats.sample_error(*normalized_list)}")

# The Z Score gives us the value in standard deviations for the
# percentile we want * For example if we want 95% of the data
# it tells us how many standard deviations are required. * The
# formula takes the distance from the mean to x and divides by
# the standard deviation.

# This will make more sense with an example. Here is a Z Table.
# If we know our mean is 40.8, the standard deviation is 3.5 and
# we want the area to the left of the point 48, we perform our
# calculation to get 2.06. * We then find 2.0 on the left of
# the Z Table * and .06 on the top. * This tells us that the
# area under the curve makes up .98030 of the total.

# Now let's talk about Confidence Intervals. Point Estimates
# are what we have largely used, but they can be inaccurate.
# An alternative is an interval * For example if we had 3
# sample means as you see here, we could instead say that
# they lie in the interval of (5, 7) * We then state how
# confident we are in the interval. Common amounts are 90%,
# 95% and 99%. For example, 90% confidence means we expect
# 9 out of 10 intervals to contain the mean *
# Alpha represents the doubt we have, which is 1 minus the
# confidence.

# Now I'll show you how to calculate a confidence interval. We
# need a sample mean, alpha, standard deviation and the
# number of samples represented by lowercase n * Here the value
# after the plus or minus represents the Margin of Error.

# Now I'll walk you through an example where we calculate the
# probable salary we would receive if we became a player for
# the Houston Rockets. We have the mean salary * We want our results
# to be confident to 95% * We get alpha from confidence *
# Critical Probability is calculated by subtracting alpha
# divided by 2 from 1. * Then we look up the Z Code in a table.
# If we search for .975 we find that the Z Code is 1.96. *
# We find our standard deviation and then plug in our values. *
# And when we do we find our Confidence Interval salary.

# Calculate the Houston Rockets salary confidence interval
salary_list = [38178000, 37800000, 14057730, 11301219, 8349039,
               3540000, 2564753, 2564753, 2174318, 2028594,
               1845301, 903111, 8111447, 695526, 568422]

# Formula (x,y) = x̄ ± Z(α/2) * σ/√n
# x̄ : Sample Mean
# α : Alpha (1 - Confidence)
# σ : Standard Deviation
# n : Sample Size
# get_confidence_interval(sample_mean, confidence, sd, sample_size)
sample_mean = stats.mean(*salary_list)
print(f"Mean {sample_mean}")
confidence = .95
standard_deviation = stats.standard_deviation(*salary_list)
print(f"Standard Deviation {standard_deviation}")
stats.get_confidence_interval(sample_mean, confidence, standard_deviation, 15)
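The two-dice probabilities and the Z Table example described in the comments can be reproduced with a few lines of plain Python. This is a small sketch of my own, not part of the original script.

# Hypothetical illustration (my addition): the probability distribution
# for the sum of two dice, and the Z Score example from the comments
# above (mean 40.8, standard deviation 3.5, x = 48)
two_dice_counts = {}
for first in range(1, 7):
    for second in range(1, 7):
        total = first + second
        two_dice_counts[total] = two_dice_counts.get(total, 0) + 1

# 36 equally likely rolls, so each probability is count / 36
two_dice_probs = {total: count / 36 for total, count in two_dice_counts.items()}
print(f"Two Dice Probabilities : {two_dice_probs}")
print(f"Sum of Probabilities : {sum(two_dice_probs.values())}")

# Z = (x - μ) / σ  ->  (48 - 40.8) / 3.5 ≈ 2.06
print(f"Z Score : {(48 - 40.8) / 3.5}")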
import statistics as stats
import random

# Student's T Distributions are used when your sample
# size is small and/or the population variance is
# unknown

# A T Distribution looks like a Normal Distribution
# with fatter tails, meaning a wider dispersion of variables

# When we know the standard deviation we can compute the
# Z Score and use the Normal Distribution to calculate
# probabilities

# The formula is t = (x̅ - μ) / (s/√n), where x̅ is the
# sample mean, μ is the population mean, s is the
# standard deviation of the sample and n is the sample
# size

# In this example let's say a manufacturer is promising
# brake pads will last for 65,000 km with a .95
# confidence level. * Our sample mean is 62,456.2
# * The standard deviation is 2418.4

# Degrees of freedom is the number of samples taken
# minus 1. If we take 30 samples that means degrees of
# freedom equals 29.

# If confidence is .95 then alpha is .05. For a two sided
# interval we look up 29 degrees of freedom and α/2 = .025
# in the T Table and get a value of 2.045

# If we plug our values into our formula we find the
# interval for our sample.

# Generate a random list between 58000 and 68000
# break_pad_kms = [random.randint(58000, 68000) for i in range(30)]
break_pad_kms = [58500, 58700, 62800, 57220, 62750, 59370, 57720,
                 60920, 61910, 59260, 63550, 60520, 58710, 57340,
                 60660, 57750, 60430, 60050, 62970, 58870]

stats.get_t_confidence_interval(.95, *break_pad_kms)

# When used with the previous formula you can see the results
# are similar
stats.get_confidence_interval(60000, .95, 1988.1782, 20)

# Let's talk about the difference between Dependent &
# Independent Samples. With Dependent Samples, 1 sample
# can be used to determine the other sample's results.
# You'll often see examples of cause & effect or pairs
# of results. An example would be: if I roll a die, what
# is the probability that it is odd? Or: if subjects
# lifted dumbbells each day and recorded results before
# and after the week, what did we find?

# Independent Samples are those in which samples from
# 1 population have no relation to another group. Normally
# you'll see the word random and not cause and effect
# terms. An example: blood samples are taken from 10
# random people and tested at lab A, while 10 other random
# samples are tested at lab B. Or: give 1 random group
# a drug and another a placebo and test the results.

# When thinking about probabilities we first must create
# a hypothesis. A hypothesis is an educated guess that
# you can test * If you say restaurants in Los Angeles
# are expensive, that is a statement and not a hypothesis
# because there is nothing to test it against * If
# however we say restaurants in Los Angeles are expensive
# versus restaurants in Pittsburgh, we can test for that.

# * The technical name for the hypothesis we are testing
# is the Null Hypothesis. An example is a test to see if
# average used car prices fall between $19,000 and
# $21,000 * The Alternative Hypothesis includes all
# other possible prices in this example. That would be
# values from $0 to $19,000 and then from $21,000 and
# higher.

# When you test a hypothesis, the probability of
# rejecting the Null Hypothesis when it is actually
# true is called the Significance Level, represented by
# α. * Common αs include .01, .05 and .1. * Previously
# we talked about Z Tables. If the sample mean and
# the population mean are equal then Z equals 0. * If
# we create a bell graph and we know that α is .05,
# then we know that the rejection region for the Null
# Hypothesis is found at α/2 or .025. * If we use a
# Z Table and we know µ is 0 and α/2 = .025, we find that
# the rejection region is less than -1.96 and greater
# than 1.96. (This is known as a 2 sided test)

# * With 1 sided tests, for example if I say I think
# used car prices are greater than $21,000, the
# rejection region is everything to the right of the
# Z Code for α instead of α/2, which is 1 - .05 = .95.
# In the Z Table that is 1.65.

# When it comes to hypothesis errors there are 2 types.
# Type I Errors, called False Positives, refer to a
# rejection of a true null hypothesis. The probability
# of making this error is alpha. * Then you have Type
# II Errors, called False Negatives, which is when you
# accept a false null hypothesis. This error is
# normally caused by poor sampling. The probability
# of making this error is represented by Beta.

# * The goal of hypothesis testing is to reject
# a false null hypothesis, which has a probability
# of 1 - Beta. You increase the power of the test by
# increasing the number of samples.

# This example will clear hypothesis errors up. Say your
# null hypothesis is that there is no reason to apply
# for a job because you won't get it.
# You can call this the status quo belief. * If you then
# don't apply and the null hypothesis was correct,
# you'd see that your decision was correct. * Also if
# you rejected the null hypothesis, applied and
# got the job, you would again see that you made the
# correct decision. * However if the hypothesis was
# correct and you applied, that would be an example of a
# Type I Error. * And if you chose not to apply
# but the hypothesis was false, this would be an example
# of a Type II Error.
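The comments above quote a sample mean of 62,456.2 and a standard deviation of 2,418.4 against the manufacturer's 65,000 km promise. As a rough illustration of my own (not part of the original script), the t statistic for those numbers can be computed directly from t = (x̅ - μ) / (s/√n).

import math

# Hypothetical illustration using the numbers quoted in the comments above
sample_mean = 62456.2      # x̅
population_mean = 65000    # μ (the manufacturer's promise)
sample_sd = 2418.4         # s
n = 30                     # sample size, so 29 degrees of freedom

t = (sample_mean - population_mean) / (sample_sd / math.sqrt(n))
print(f"t statistic : {t:.3f}")  # compare against the T Table row for 29 degrees of freedom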
import statistics as stats

# I want to calculate if my sample is higher or
# lower than the population mean. To find out I
# need a 2 sided test. The population mean is the
# Null Hypothesis. * That Null Hypothesis is that
# brake pads should last for 64,000 kms.
population_mean = 64000

# * Here is my sample brake pad data
break_pad_kms = [58500, 58700, 62800, 57220, 62750, 59370, 57720,
                 60920, 61910, 59260, 63550, 60520, 58710, 57340,
                 60660, 57750, 60430, 60050, 62970, 58870]

# We calculate our sample mean * standard deviation
# * sample size * and Sample Error
# * We need to standardize our means so we can compare
# them even if they have different standard deviations.
# * We standardize our variable by subtracting
# the mean and then dividing by the standard deviation.
# When we do this we normalize our data, meaning we
# get a mean of zero and a standard deviation of 1.
# Z = (x̅ - μ0) / Sample Error
# Sample Error = standard_deviation(*args) /
#                (math.sqrt(len(args)))
# * We then get the absolute value of this result
sample_mean = stats.mean(*break_pad_kms)
sample_sd = ("%.2f" % stats.standard_deviation(*break_pad_kms))
sample_error = ("%.2f" % stats.sample_error(*break_pad_kms))
print(f"Mean : {sample_mean}")
print(f"Standard Deviation : {sample_sd}")
print(f"Sample Size : {len(break_pad_kms)}")
print(f"Sample Error : {sample_error}")
z_score = (sample_mean - population_mean) / float(sample_error)
print(f"Z Score : {z_score}")

# If my confidence is .95, α is .05 and since we are
# using a 2 sided test we use α/2 = .025. * If we
# subtract .025 from 1 we get .9750. * If we look up
# .9750 on the Z Table we get a Z Score of 1.96.

# * We now compare the absolute value of the z score we
# calculated before, which is 8.99, to the Critical
# Value which is 1.96. Since 8.99 is greater than 1.96,
# we reject the Null Hypothesis. To be
# more specific, we are saying that at a .95 confidence level
# we reject that the brake pads have an average
# lifecycle of 64,000 km.

# The P Value is the smallest level of significance
# at which we can reject the Null Hypothesis. * In
# our example we found a Z Score of 8.99, which isn't
# on our chart. * Let's say instead that the Null
# Hypothesis was 61,750 kms. That would mean the
# hypothesis could be rejected at 1 - .99996 = .00004
# significance. So here the P Value for a 1 sided
# test is .00004. For a 2 sided test we multiply
# .00004 by 2 = .00008.
population_mean = 61750
z_score = (sample_mean - population_mean) / float(sample_error)
print(f"Z Score : {z_score}")
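The comments above read P Values off a Z Table. The same number can be approximated in code with the standard normal CDF built from math.erf; this is a sketch of my own, assuming it is appended to the script above so that z_score is already defined.

import math

# Hypothetical addition (not in the original script): approximate the
# P Value in code instead of reading it from a Z Table
def normal_cdf(z):
    # Cumulative probability P(Z <= z) for the standard normal distribution
    return 0.5 * (1 + math.erf(z / math.sqrt(2)))

# One and two sided P Values for the second test above (H0 : 61,750 km)
p_one_sided = 1 - normal_cdf(abs(z_score))
print(f"One Sided P Value : {p_one_sided}")
print(f"Two Sided P Value : {2 * p_one_sided}")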
import statistics as stats
import random
from functools import reduce

# Regression Analysis is used to examine the
# relationship between 2 or more variables.
# * You use it to determine which factors matter
# the most * and which factors can be ignored.

# * The Dependent Variable is what you want to
# better understand. Independent Variables are
# what does or doesn't affect the Dependent
# Variable.

# * For example if we had a movie theater
# and we wanted to improve customer satisfaction,
# customer satisfaction would be the dependent
# variable. The independent variables that affect
# it may be sound quality, picture quality,
# seat comfort, the quality of the food or price.

# Simple Linear Regression Model
# y = β0 + β1 * x1 + ε
# y : Dependent Variable (what we are trying to predict); ŷ is used for predicted values
# x1, ... xn : Independent Variables
# β0 : A constant that represents the minimum
#      value of y (Y Intercept)
# β1 : The coefficient that quantifies the effect
#      of the independent variable x1 on y
# ε : Represents estimation errors, which would
#     be the difference between the observed
#     value of y and the value for y that the regression
#     predicts. The average value should be 0.

# We do this like we do with any linear equation. We find the slope and then
# b0, which is the Y intercept. We are basically averaging the sample points
# to our line. This is called the regression line. We note that it is a
# regression line by using y hat instead of y.

# Here is the formula for calculating b1. We sum the product of each value
# of x minus its mean and each value of y minus its mean. We then divide by
# the sum of each value of x minus the mean, squared (squaring eliminates
# negative values). Now we have the slope. To calculate the
# y intercept, or b0, I find y bar - slope * x bar.

# Here is an example of how you'd calculate the linear regression
# line. Get the means for x & y. Sum the product of each value of
# x minus the mean and the same for y. Get the sum of all values of
# x minus the mean, squared. Then find the slope by dividing those
# values to get 5.958. Then calculate the value for the y intercept.
# Then you can create the formula for the line, which you can see
# to the right.

# How do we find out if our regression line is a good fit for our
# data? We do that with something we have already covered, which is
# the correlation coefficient. Remember that the correlation
# coefficient calculates whether the values of x and y are
# related (correlated). We calculate it by finding the covariance
# of X & Y and then dividing by the product of the standard deviations
# of X & Y. If the value is close to 1 then the data is highly
# correlated, which means our regression line should have an easy
# time modeling the data.

# Let's work through an example where we find the correlation
# coefficient. First we must calculate the covariance for all
# x and y values, which equals 1733.09.

# Now that we have the covariance we can divide it by the
# standard deviation of x multiplied by the standard deviation
# of y. When we do that we get .9618. Since .9618 is so close to
# 1 we know that our linear regression line will be tightly
# matched to the data.

temp_sales_day_list = [[37, 292], [40, 228], [49, 324], [61, 376],
                       [72, 440], [79, 496], [83, 536], [81, 556],
                       [75, 496], [64, 412], [53, 324], [40, 320]]

temp_sales_sep_list = [[37, 40, 49, 61, 72, 79, 83, 81, 75, 64, 53, 40],
                       [292, 228, 324, 376, 440, 496, 536, 556, 496, 412, 324, 320]]

print("Linear Regression List")
print(stats.get_linear_regression_list(temp_sales_day_list))
print(f"Correlation Coefficient : {stats.correlation_coefficient(temp_sales_sep_list)}")
print()


# Generates a random list that adds up to the provided goal_sum
# using the defined number of values num_values
def random_list_defined_sum(goal_sum, num_values):
    # Generate a random sample with values in the range
    # between 1 and the target sum, and add 0 and the goal
    # sum to the endpoints of the list
    a = random.sample(range(1, goal_sum), num_values) + [0, goal_sum]
    # Sort the list
    list.sort(a)
    # If you subtract successive values in the list the resulting
    # list will have the defined goal sum
    return [a[m+1] - a[m] for m in range(len(a) - 1)]


# Generates a random list that will have an average value
# of expected_avg, a defined list length list_length, a
# minimum value a and a maximum value of b
def random_list_defined_avg(expected_avg, list_length, a, b):
    while True:
        # Generate random list with values between min and max
        rand_list = [random.randint(a, b) for i in range(list_length)]
        # Find averages for the list until we get our target average
        # and then return the list
        avg = reduce(lambda x, y: x + y, rand_list) / len(rand_list)
        if avg == expected_avg:
            return rand_list


# Define list of average temperatures I'll use for testing
# linear regression
temp_list = [37, 40, 49, 61, 72, 79, 83, 81, 75, 64, 53, 40]
# Used to generate fake sales with defined sums for testing
# linear regression
sales_list = [292, 228, 324, 376, 440, 496, 536, 556, 496, 412, 324, 320]
# List that will hold all temperature values generated, which
# will be used to calculate the Correlation Coefficient
gen_temp_list = []
# List that will hold all the generated sales for each day
# that I'll use to calculate the Correlation Coefficient
gen_sales_list = []
# Will hold both gen_temp_list and gen_sales_list
gen_sales_temp_list = []


# Will generate all of the fake temperature and sales data
# that I can use to demonstrate both linear regression and
# how the Correlation Coefficient can tell us if a linear
# regression is a good fit for our sample points
def get_temp_sales_list():
    new_list = []
    # Generate 12 months of temp and sales lists
    for i in range(12):
        new_temp_list = random_list_defined_avg(temp_list[i], 4, temp_list[i]-2, temp_list[i]+2)
        new_sales_list = random_list_defined_sum(sales_list[i], 4)
        # Generate 4 days worth of temp and sales lists for
        # each month
        for j in range(4):
            # Make individual lists for just daily temp and
            # sales and append those lists
            day_data_list = [new_temp_list[j], new_sales_list[j]]
            new_list.append(day_data_list)
            # Add to list of just temps and sales
            gen_temp_list.append(new_temp_list[j])
            gen_sales_list.append(new_sales_list[j])
    return new_list


ice_cream_list = get_temp_sales_list()
lr_list = stats.get_linear_regression_list(ice_cream_list)
# print(lr_list)
# print(gen_temp_list)
# print(gen_sales_list)

# Contains x and y values for lr_list
x_y_ic_list = [gen_temp_list, gen_sales_list]
# print(f"Ice Cream Correlation Coefficient : {stats.correlation_coefficient(x_y_ic_list)}")
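The comments above quote a slope of 5.958, a covariance of 1733.09 and a correlation coefficient of .9618 for the monthly temperature and sales data. Here is a small check of my own (not part of the original script, and assuming it is appended after temp_sales_sep_list is defined) that recomputes the slope and intercept directly from b1 = Σ(xᵢ - x̄)(yᵢ - ȳ) / Σ(xᵢ - x̄)² and b0 = ȳ - b1 * x̄.

# Hypothetical check (my addition): compute the regression slope and
# intercept for the monthly temperature / sales data from the formulas
x_vals = temp_sales_sep_list[0]
y_vals = temp_sales_sep_list[1]

x_bar = sum(x_vals) / len(x_vals)
y_bar = sum(y_vals) / len(y_vals)

# b1 = Σ(xi - x̄)(yi - ȳ) / Σ(xi - x̄)²   and   b0 = ȳ - b1 * x̄
b1 = sum((x - x_bar) * (y - y_bar) for x, y in zip(x_vals, y_vals)) / \
     sum((x - x_bar) ** 2 for x in x_vals)
b0 = y_bar - b1 * x_bar
print(f"Regression Line : ŷ = {b0:.3f} + {b1:.3f}x")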
import statistics as stats

sample_regression_list = [[292, 228, 324, 376, 440, 496, 536, 556, 496, 412, 324, 320],
                          [256, 273, 327, 399, 464, 506, 530, 518, 482, 416, 351, 273]]

# Root Mean Squared Deviation is the measure of the
# differences between sample points and the regression
# line. We are using all these formulas to better understand
# how well our linear regression equation is estimating
# the data. So we find the residual for each data point.
# The residual is represented by the black lines that go
# from the data point to the regression line. If each
# residual is e, we take the sum of all residuals squared,
# divided by the number of samples minus 1. We have the
# table with both the samples and the regression line,
# so I'll find the Root Mean Squared Deviation.

# I calculate e by subtracting the value of my regression
# line from the sample y. I then square all those values
# and find their sum. If I divide by the number of samples
# minus 1 and then find the square root I get 28.86. That
# means that within 1 standard deviation, which accounts for
# 68% of all samples, our regression line will be off by at
# most plus or minus 28.86.

# We could then add and subtract 28.86 and create 2 more
# lines that will capture 68% of all values. * We could
# then add in another line on the top and bottom and
# capture 95% of all points.

print(f"Root Mean Squared Deviation : {stats.root_mean_squared_deviation(sample_regression_list)}")

# Now to finish up we'll talk about Chi (pronounced "kai") Square
# Tests. Before you can perform the tests you must meet the
# conditions that the data is random, large (each cell must
# be > 5) and independent (Sample with Replacement or the
# 10% rule). The Chi Square Test of Homogeneity is used when
# you want to look at the relationship between different
# categories of variables. This is used when you sample
# from 2 groups and want to compare their probability
# distributions.

# What we are trying to find is if age has an effect on
# people's preferences for their favorite sport. Our null
# hypothesis is that age doesn't affect favorite sport
# and the alternative is that it does.

# If we calculate the percentages for all columns we
# get these results. Now to prove the null hypothesis
# we should expect that 25% of 18 to 29 year olds
# prefer the NBA, for example. Also the percentages should
# work out for all other sports organizations. The easiest
# way to calculate the expected value for each cell in the
# chart is to multiply the cell's column total by the row
# total and then divide by the total number of people. So
# the expected value for 18 to 29 year olds that like the
# NBA is 66 * 35 / 142 = 16.3

# I calculated the expected value for each cell. You can
# see that the row and column totals are still the same. The
# Chi Square formula is χ² = Σ(observed - expected)² / expected.
# If we perform this calculation we get 11.59.
# The larger this value, the more likely these variables
# affect each other. We look up this value in a Chi Square
# table, but we also must have the degrees of freedom for
# our data. You get that by multiplying the number of columns
# minus 1 by the number of rows minus 1, or 3 * 1 = 3.

# Now we find our degrees of freedom and the closest match to
# 11.59 in a Chi Square Test table. When we do, we find we can
# be roughly 99% confident that age does affect a person's
# favorite sport, so we reject the null hypothesis.

fav_sport_lists = [[23, 12, 24, 7],
                   [12, 13, 45, 6]]

stats.chi_square_test(fav_sport_lists)
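The expected-value rule described in the comments (row total times column total divided by the grand total) can also be checked by hand. Here is a sketch of my own (not part of the original script, and assuming it is appended after fav_sport_lists is defined) that builds the expected table and applies χ² = Σ(observed - expected)² / expected.

# Hypothetical check (my addition): expected table and chi square
# statistic for fav_sport_lists
observed = fav_sport_lists
row_totals = [sum(row) for row in observed]
col_totals = [sum(col) for col in zip(*observed)]
grand_total = sum(row_totals)

# Expected value for each cell = row total * column total / grand total
expected = [[row_totals[r] * col_totals[c] / grand_total
             for c in range(len(col_totals))]
            for r in range(len(row_totals))]
print(expected)  # first cell is 66 * 35 / 142 ≈ 16.3, as in the comments above

chi_square = sum((observed[r][c] - expected[r][c]) ** 2 / expected[r][c]
                 for r in range(len(row_totals))
                 for c in range(len(col_totals)))
degrees_of_freedom = (len(col_totals) - 1) * (len(row_totals) - 1)
print(f"Chi Square : {chi_square:.2f}, Degrees of Freedom : {degrees_of_freedom}")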
import math
import re

pos_z_code_list = [
    ['.50000', '.50399', '.50798', '.51197', '.51595', '.51994', '.52392', '.52790', '.53188', '.53586'],
    ['.53983', '.54380', '.54776', '.55172', '.55567', '.55962', '.56356', '.56749', '.57142', '.57535'],
    ['.57926', '.58317', '.58706', '.59095', '.59483', '.59871', '.60257', '.60642', '.61026', '.61409'],
    ['.61791', '.62172', '.62552', '.62930', '.63307', '.63683', '.64058', '.64431', '.64803', '.65173'],
    ['.65542', '.65910', '.66276', '.66640', '.67003', '.67364', '.67724', '.68082', '.68439', '.68793'],
    ['.69146', '.69497', '.69847', '.70194', '.70540', '.70884', '.71226', '.71566', '.71904', '.72240'],
    ['.72575', '.72907', '.73237', '.73565', '.73891', '.74215', '.74537', '.74857', '.75175', '.75490'],
    ['.75804', '.76115', '.76424', '.76730', '.77035', '.77337', '.77637', '.77935', '.78230', '.78524'],
    ['.78814', '.79103', '.79389', '.79673', '.79955', '.80234', '.80511', '.80785', '.81057', '.81327'],
    ['.81594', '.81859', '.82121', '.82381', '.82639', '.82894', '.83147', '.83398', '.83646', '.83891'],
    ['.84134', '.84375', '.84614', '.84849', '.85083', '.85314', '.85543', '.85769', '.85993', '.86214'],
    ['.86433', '.86650', '.86864', '.87076', '.87286', '.87493', '.87698', '.87900', '.88100', '.88298'],
    ['.88493', '.88686', '.88877', '.89065', '.89251', '.89435', '.89617', '.89796', '.89973', '.90147'],
    ['.90320', '.90490', '.90658', '.90824', '.90988', '.91149', '.91309', '.91466', '.91621', '.91774'],
    ['.91924', '.92073', '.92220', '.92364', '.92507', '.92647', '.92785', '.92922', '.93056', '.93189'],
    ['.93319', '.93448', '.93574', '.93699', '.93822', '.93943', '.94062', '.94179', '.94295', '.94408'],
    ['.94520', '.94630', '.94738', '.94845', '.94950', '.95053', '.95154', '.95254', '.95352', '.95449'],
    ['.95543', '.95637', '.95728', '.95818', '.95907', '.95994', '.96080', '.96164', '.96246', '.96327'],
    ['.96407', '.96485', '.96562', '.96638', '.96712', '.96784', '.96856', '.96926', '.96995', '.97062'],
    ['.97128', '.97193', '.97257', '.97320', '.97381', '.97441', '.97500', '.97558', '.97615', '.97670'],
    ['.97725', '.97778', '.97831', '.97882', '.97932', '.97982', '.98030', '.98077', '.98124', '.98169'],
    ['.98214', '.98257', '.98300', '.98341', '.98382', '.98422', '.98461', '.98500', '.98537', '.98574'],
    ['.98610', '.98645', '.98679', '.98713', '.98745', '.98778', '.98809', '.98870', '.98899', '.98928'],
    ['.98956', '.98983', '.99010', '.99036', '.99061', '.99086', '.99111', '.99134', '.99158', '.99180'],
    ['.99202', '.99224', '.99245', '.99266', '.99286', '.99305', '.99324', '.99343', '.99361', '.99379'],
    ['.99396', '.99413', '.99430', '.99446', '.99461', '.99477', '.99492', '.99506', '.99520', '.99534'],
    ['.99547', '.99560', '.99573', '.99585', '.99598', '.99609', '.99621', '.99632', '.99643', '.99653'],
    ['.99664', '.99674', '.99683', '.99693', '.99702', '.99711', '.99720', '.99728', '.99736', '.99744'],
    ['.99752', '.99760', '.99767', '.99774', '.99781', '.99788', '.99795', '.99801', '.99807', '.99813'],
    ['.99819', '.99825', '.99831', '.99836', '.99841', '.99846', '.99851', '.99856', '.99861', '.99865'],
    ['.99869', '.99874', '.99878', '.99882', '.99886', '.99889', '.99893', '.99896', '.99900', '.99903'],
    ['.99906', '.99910', '.99913', '.99916', '.99918', '.99921', '.99924', '.99926', '.99929', '.99931'],
    ['.99934', '.99936', '.99938', '.99940', '.99942', '.99944', '.99946', '.99948', '.99950', '.99952'],
    ['.99953', '.99955', '.99957', '.99958', '.99960', '.99961', '.99962', '.99964', '.99965', '.99966'],
    ['.99968', '.99969', '.99970', '.99971', '.99972', '.99973', '.99974', '.99975', '.99976', '.99977'],
    ['.99978', '.99978', '.99979', '.99980', '.99981', '.99981', '.99982', '.99983', '.99983', '.99984'],
    ['.99985', '.99985', '.99986', '.99986', '.99987', '.99987', '.99988', '.99988', '.99989', '.99989'],
    ['.99990', '.99990', '.99990', '.99991', '.99991', '.99992', '.99992', '.99992', '.99992', '.99993'],
    ['.99993', '.99993', '.99994', '.99994', '.99994', '.99994', '.99995', '.99995', '.99995', '.99995'],
    ['.99995', '.99996', '.99996', '.99996', '.99996', '.99996', '.99996', '.99997', '.99997']]

t_table_confidence = ['0.1000', '0.0500', '0.0250', '0.0100', '0.0050', '0.0010', '0.0005']

t_table_list = [
    [3.078, 6.314, 12.076, 31.821, 63.657, 318.310, 636.620],
    [1.886, 2.920, 4.303, 6.965, 9.925, 22.326, 31.598],
    [1.638, 2.353, 3.182, 4.541, 5.841, 10.213, 12.924],
    [1.533, 2.132, 2.776, 3.747, 4.604, 7.173, 8.610],
    [1.476, 2.015, 2.571, 3.365, 4.032, 5.893, 6.869],
    [1.440, 1.943, 2.447, 3.143, 3.707, 5.208, 5.959],
    [1.415, 1.895, 2.365, 2.998, 3.499, 4.785, 5.408],
    [1.397, 1.860, 2.306, 2.896, 3.355, 4.501, 5.041],
    [1.383, 1.833, 2.262, 2.821, 3.250, 4.297, 4.781],
    [1.372, 1.812, 2.228, 2.764, 3.169, 4.144, 4.587],
    [1.363, 1.796, 2.201, 2.718, 3.106, 4.025, 4.437],
    [1.356, 1.782, 2.179, 2.681, 3.055, 3.930, 4.318],
    [1.350, 1.771, 2.160, 2.650, 3.012, 3.852, 4.221],
    [1.345, 1.761, 2.145, 2.624, 2.977, 3.787, 4.140],
    [1.341, 1.753, 2.131, 2.602, 2.947, 3.733, 4.073],
    [1.337, 1.746, 2.120, 2.583, 2.921, 3.686, 4.015],
    [1.333, 1.740, 2.110, 2.567, 2.898, 3.646, 3.965],
    [1.330, 1.734, 2.101, 2.552, 2.878, 3.610, 3.922],
    [1.328, 1.729, 2.093, 2.539, 2.861, 3.579, 3.883],
    [1.325, 1.725, 2.086, 2.528, 2.845, 3.552, 3.850],
    [1.323, 1.721, 2.080, 2.518, 2.831, 3.527, 3.819],
    [1.321, 1.717, 2.074, 2.508, 2.819, 3.505, 3.792],
    [1.319, 1.714, 2.069, 2.500, 2.807, 3.485, 3.767],
    [1.318, 1.711, 2.064, 2.492, 2.797, 3.467, 3.745],
    [1.316, 1.708, 2.060, 2.485, 2.787, 3.450, 3.725],
    [1.315, 1.706, 2.056, 2.479, 2.779, 3.425, 3.707],
    [1.314, 1.703, 2.052, 2.473, 2.771, 3.421, 3.690],
    [1.313, 1.701, 2.048, 2.467, 2.763, 3.408, 3.674],
    [1.311, 1.699, 2.045, 2.462, 2.756, 3.396, 3.659],
    [1.310, 1.697, 2.042, 2.457, 2.750, 3.385, 3.646],
    [1.303, 1.684, 2.021, 2.423, 2.704, 3.307, 3.551],
    [1.296, 1.671, 2.000, 2.390, 2.660, 3.232, 3.460],
    [1.289, 1.658, 1.980, 2.358, 2.617, 3.160, 3.373],
    [1.282, 1.645, 1.960, 2.326, 2.576, 3.090, 3.291]]


def mean(*args):
    val_sum = sum(args)
    return val_sum / len(args)


def median(*args):
    # Assumes the values are passed in sorted order
    if len(args) % 2 == 0:
        # Average the two middle values
        i = len(args) // 2
        j = i - 1
        return (args[i] + args[j]) / 2
    else:
        # Return the middle value
        k = len(args) // 2
        return args[k]


def mode(*args):
    # Count how many times values show up in
    # the list and put it in a dictionary
    dict_vals = {i: args.count(i) for i in args}
    # Create a list of keys that have the maximum
    # number of occurrences in the list
    max_list = [k for k, v in dict_vals.items() if v == max(dict_vals.values())]
    return max_list


def variance(*args):
    mean_val = mean(*args)
    numerator = 0
    for i in args:
        numerator += (i - mean_val) ** 2
    denominator = len(args) - 1
    try:
        answer = numerator / denominator
    except ZeroDivisionError:
        answer = numerator / 1
    return answer


def standard_deviation(*args):
    return math.sqrt(variance(*args))


def coefficient_variation(*args):
    return standard_deviation(*args) / mean(*args)


def covariance(*args):
    # Use a list comprehension to get all values
    # stored in the 1st & 2nd list
    list_1 = [i[0] for i in args]
    list_2 = [i[1] for i in args]
    # Pass those lists to get their means
    list_1_mean = mean(*list_1[0])
    list_2_mean = mean(*list_2[0])
    numerator = 0
    # We must have the same number of elements
    # in both lists
    if len(list_1[0]) == len(list_2[0]):
        for i in range(len(list_1[0])):
            # Find xi - x mean * yi - y mean
            numerator += (list_1[0][i] - list_1_mean) * (list_2[0][i] - list_2_mean)
        denominator = len(list_1[0]) - 1
        return numerator / denominator
    else:
        print("Error : You must have the same number of values in both lists")


def correlation_coefficient(*args):
    list_1 = [i[0] for i in args]
    list_2 = [i[1] for i in args]
    # Pass those lists to get their standard deviations
    list_1_sd = standard_deviation(*list_1[0])
    list_2_sd = standard_deviation(*list_2[0])
    denominator = list_1_sd * list_2_sd
    # Get the covariance
    numerator = covariance(*args)
    print(f"Covariance {numerator}")
    print(f"list_1_sd {list_1_sd}")
    print(f"list_2_sd {list_2_sd}")
    return numerator / denominator


def normalize_list(*args):
    sd_list = standard_deviation(*args)
    return [(i - mean(*args)) / sd_list for i in args]


def sample_error(*args):
    sd_list = standard_deviation(*args)
    return sd_list / (math.sqrt(len(args)))


def get_z_code(z_code_area):
    # Get index for first closest matching value in Z Table
    # Trim the leading 0 from the area because it isn't used in the
    # list of table values
    z_code_area = ("%.3f" % z_code_area).lstrip('0')
    # Create the regex with the 3 provided digits and any
    # last 2 digits
    regex = "\\" + z_code_area + "\\d{2}"
    # Iterate the multidimensional list
    for i in range(0, len(pos_z_code_list) - 1):
        for j in range(0, len(pos_z_code_list[0])):
            # If I find a match
            if re.search(regex, pos_z_code_list[i][j]):
                # Combine column and row values into Z Code
                z_code = float(i * .1 + j * .01)
                return z_code


# Formula (x,y) = x̄ ± Z(α/2) * σ/√n
# x̄ : Sample Mean
# α : Alpha (1 - Confidence)
# σ : Standard Deviation
# n : Sample Size
def get_confidence_interval(sample_mean, confidence, sd, sample_size):
    alpha_val = (1 - confidence)
    critical_probability = 1 - alpha_val / 2
    z_code = get_z_code(critical_probability)
    print("Alpha {:.3f}".format(alpha_val))
    print("Critical Probability {:.3f}".format(critical_probability))
    print("Z Code {:.3f}".format(z_code))
    print("Margin of Error {:.3f}".format((z_code * (sd / math.sqrt(sample_size)))))
    x = sample_mean - (z_code * (sd / math.sqrt(sample_size)))
    y = sample_mean + (z_code * (sd / math.sqrt(sample_size)))
    print(f"Confidence Interval")
    print("Low : {:.2f}".format(x))
    print("High : {:.2f}".format(y))


def get_t_confidence_interval(confidence, *args):
    # Get alpha for T Table with 4 decimals
    half_alpha = (1 - confidence) / 2
    half_alpha = ("%.4f" % half_alpha)
    # Get the T Value, sample mean and standard
    # deviation based on the data
    if half_alpha in t_table_confidence:
        alpha_index = t_table_confidence.index(half_alpha)
        # Subtract 2 instead of 1 because list is 0 based
        degree_freedom = len(args) - 2
        if 1 <= degree_freedom <= 30:
            t_value = t_table_list[degree_freedom][alpha_index]
        elif 31 <= degree_freedom <= 60:
            t_value = t_table_list[31][alpha_index]
        elif 61 <= degree_freedom <= 120:
            t_value = t_table_list[32][alpha_index]
        else:
            t_value = t_table_list[33][alpha_index]
        sample_mean = mean(*args)
        sd = standard_deviation(*args)
        print("T Distribution")
        print("Sample Mean : {:.4f}".format(sample_mean))
        print("Standard Deviation : {:.4f}".format(sd))
        print("T Value : {:.3f}".format(t_value))
        # Print high and low ends of the interval
        low_val = sample_mean - (t_value * (sd / math.sqrt(degree_freedom)))
        high_val = sample_mean + (t_value * (sd / math.sqrt(degree_freedom)))
        print("Low : {:.2f}".format(low_val))
        print("High : {:.2f}".format(high_val))


# Receives a list of x & y samples and returns a
# regression list
def get_linear_regression_list(*args):
    # Sum of all x and y values
    x_sum = 0
    y_sum = 0
    for i in range(len(args)):
        for j in range(len(args[i])):
            x_sum += args[i][j][0]
            y_sum += args[i][j][1]
    # Get x & y bar (Means)
    x_bar = x_sum / len(args[0])
    y_bar = y_sum / len(args[0])
    numerator = 0
    denominator = 0
    for i in range(len(args)):
        for j in range(len(args[i])):
            x_sums = args[i][j][0] - x_bar
            denominator += math.pow(x_sums, 2)
            numerator += x_sums * (args[i][j][1] - y_bar)
    slope = numerator / denominator
    y_intercept = y_bar - slope * x_bar
    # Create multidimensional list of x y values
    # for the regression line with x being equal
    # to all values of x in the passed list
    lr_list = [[0] * 2 for k in range(len(args[0]))]
    for l in range(len(args)):
        for m in range(len(args[l])):
            # Get x value
            lr_list[m][0] = args[l][m][0]
            # Calculate y value
            lr_list[m][1] = int(y_intercept + (slope * args[l][m][0]))
    # Return the linear regression list
    return lr_list


chi_square_list = [[.45, 1.64, 2.70, 3.84, 5.02, 5.41, 6.63, 7.87, 9.55, 10.82],
                   [1.38, 3.21, 4.60, 5.99, 7.37, 7.82, 9.21, 10.59, 12.42, 13.81],
                   [2.36, 4.64, 6.25, 7.81, 9.34, 9.83, 11.34, 12.83, 14.79, 16.266]]

chi_per_list = [.5, .2, .1, .05, .025, .02, .01, .005, .002, .001]


def root_mean_squared_deviation(*args):
    y_sample_list = [i[0] for i in args]
    y_regression_list = [i[1] for i in args]
    sample_length = len(args[0][0])
    numerator = 0
    denominator = sample_length - 1
    for j in range(sample_length):
        difference = args[0][0][j] - args[0][1][j]
        numerator += math.pow(difference, 2)
    return math.sqrt(numerator / denominator)


def chi_square_test(*args):
    list_1 = [i[0] for i in args]
    list_2 = [i[1] for i in args]
    num_cols = len(args[0][0])
    num_rows = len(args[0])
    degree_freedom = (num_cols - 1) * (num_rows - 1)
    col_sum_list = [sum(x) for x in zip(*args[0])]
    row_sum_list = [sum(x) for x in args[0]]
    row_sum = sum(row_sum_list)
    expected_table = []
    temp_list = []
    for i in range(len(row_sum_list)):
        for j in range(len(col_sum_list)):
            temp_list.append(round(row_sum_list[i] * col_sum_list[j] / row_sum))
        expected_table.append(temp_list)
        temp_list = []
    chi_num = 0
    for m in range(len(list_1[0])):
        chi_num += math.pow(expected_table[0][m] - list_1[0][m], 2) / expected_table[0][m]
    for n in range(len(list_2[0])):
        # Use the 2nd row of the expected table for the 2nd row of observations
        chi_num += math.pow(expected_table[1][n] - list_2[0][n], 2) / expected_table[1][n]
    for p in range(9):
        if chi_num <= chi_square_list[degree_freedom - 1][p]:
            print(f"Confidence : {1 - chi_per_list[p]}")
            break