diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c
index 46086dd43bfb7ac6611092b48cec4c1d99aeffa2..f1aa0591112a4c5ab15ff4537484218575062626 100644
--- a/drivers/mmc/host/sdhci-tegra.c
+++ b/drivers/mmc/host/sdhci-tegra.c
@@ -66,6 +66,22 @@
 
 #define SDHCI_VNDR_TUN_CTRL0_0				0x1c0
 #define SDHCI_VNDR_TUN_CTRL0_TUN_HW_TAP			0x20000
+#define SDHCI_VNDR_TUN_CTRL0_START_TAP_VAL_MASK		0x03fc0000
+#define SDHCI_VNDR_TUN_CTRL0_START_TAP_VAL_SHIFT	18
+#define SDHCI_VNDR_TUN_CTRL0_MUL_M_MASK			0x00001fc0
+#define SDHCI_VNDR_TUN_CTRL0_MUL_M_SHIFT		6
+#define SDHCI_VNDR_TUN_CTRL0_TUN_ITER_MASK		0x000e000
+#define SDHCI_VNDR_TUN_CTRL0_TUN_ITER_SHIFT		13
+#define TRIES_128					2
+#define TRIES_256					4
+#define SDHCI_VNDR_TUN_CTRL0_TUN_WORD_SEL_MASK		0x7
+
+#define SDHCI_TEGRA_VNDR_TUN_CTRL1_0			0x1c4
+#define SDHCI_TEGRA_VNDR_TUN_STATUS0			0x1C8
+#define SDHCI_TEGRA_VNDR_TUN_STATUS1			0x1CC
+#define SDHCI_TEGRA_VNDR_TUN_STATUS1_TAP_MASK		0xFF
+#define SDHCI_TEGRA_VNDR_TUN_STATUS1_END_TAP_SHIFT	0x8
+#define TUNING_WORD_BIT_SIZE				32
 
 #define SDHCI_TEGRA_AUTO_CAL_CONFIG			0x1e4
 #define SDHCI_AUTO_CAL_START				BIT(31)
@@ -97,6 +113,8 @@
 struct sdhci_tegra_soc_data {
 	const struct sdhci_pltfm_data *pdata;
 	u32 nvquirks;
+	u8 min_tap_delay;
+	u8 max_tap_delay;
 };
 
 /* Magic pull up and pull down pad calibration offsets */
@@ -136,6 +154,8 @@ struct sdhci_tegra {
 	u32 default_trim;
 	u32 dqs_trim;
 	bool enable_hwcq;
+	unsigned long curr_clk_rate;
+	u8 tuned_tap_delay;
 };
 
 static u16 tegra_sdhci_readw(struct sdhci_host *host, int reg)
@@ -241,6 +261,7 @@ static void tegra210_sdhci_writew(struct sdhci_host *host, u16 val, int reg)
 
 	if (is_tuning_cmd) {
 		udelay(1);
+		sdhci_reset(host, SDHCI_RESET_CMD | SDHCI_RESET_DATA);
 		tegra_sdhci_configure_card_clk(host, clk_enabled);
 	}
 }
@@ -722,6 +743,7 @@ static void tegra_sdhci_set_clock(struct sdhci_host *host, unsigned int clock)
 	 */
 	host_clk = tegra_host->ddr_signaling ? clock * 2 : clock;
 	clk_set_rate(pltfm_host->clk, host_clk);
+	tegra_host->curr_clk_rate = host_clk;
 	if (tegra_host->ddr_signaling)
 		host->max_clk = host_clk;
 	else
@@ -770,6 +792,159 @@ static void tegra_sdhci_hs400_dll_cal(struct sdhci_host *host)
 			"HS400 delay line calibration timed out\n");
 }
 
+static void tegra_sdhci_tap_correction(struct sdhci_host *host, u8 thd_up,
+				       u8 thd_low, u8 fixed_tap)
+{
+	struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+	struct sdhci_tegra *tegra_host = sdhci_pltfm_priv(pltfm_host);
+	u32 val, tun_status;
+	u8 word, bit, edge1, tap, window;
+	bool tap_result;
+	bool start_fail = false;
+	bool start_pass = false;
+	bool end_pass = false;
+	bool first_fail = false;
+	bool first_pass = false;
+	u8 start_pass_tap = 0;
+	u8 end_pass_tap = 0;
+	u8 first_fail_tap = 0;
+	u8 first_pass_tap = 0;
+	u8 total_tuning_words = host->tuning_loop_count / TUNING_WORD_BIT_SIZE;
+
+	/*
+	 * Read auto-tuned results and extract good valid passing window by
+	 * filtering out un-wanted bubble/partial/merged windows.
+	 */
+	for (word = 0; word < total_tuning_words; word++) {
+		val = sdhci_readl(host, SDHCI_VNDR_TUN_CTRL0_0);
+		val &= ~SDHCI_VNDR_TUN_CTRL0_TUN_WORD_SEL_MASK;
+		val |= word;
+		sdhci_writel(host, val, SDHCI_VNDR_TUN_CTRL0_0);
+		tun_status = sdhci_readl(host, SDHCI_TEGRA_VNDR_TUN_STATUS0);
+		bit = 0;
+		while (bit < TUNING_WORD_BIT_SIZE) {
+			tap = word * TUNING_WORD_BIT_SIZE + bit;
+			tap_result = tun_status & (1 << bit);
+			if (!tap_result && !start_fail) {
+				start_fail = true;
+				if (!first_fail) {
+					first_fail_tap = tap;
+					first_fail = true;
+				}
+
+			} else if (tap_result && start_fail && !start_pass) {
+				start_pass_tap = tap;
+				start_pass = true;
+				if (!first_pass) {
+					first_pass_tap = tap;
+					first_pass = true;
+				}
+
+			} else if (!tap_result && start_fail && start_pass &&
+				   !end_pass) {
+				end_pass_tap = tap - 1;
+				end_pass = true;
+			} else if (tap_result && start_pass && start_fail &&
+				   end_pass) {
+				window = end_pass_tap - start_pass_tap;
+				/* discard merged window and bubble window */
+				if (window >= thd_up || window < thd_low) {
+					start_pass_tap = tap;
+					end_pass = false;
+				} else {
+					/* set tap at middle of valid window */
+					tap = start_pass_tap + window / 2;
+					tegra_host->tuned_tap_delay = tap;
+					return;
+				}
+			}
+
+			bit++;
+		}
+	}
+
+	if (!first_fail) {
+		WARN_ON("no edge detected, continue with hw tuned delay.\n");
+	} else if (first_pass) {
+		/* set tap location at fixed tap relative to the first edge */
+		edge1 = first_fail_tap + (first_pass_tap - first_fail_tap) / 2;
+		if (edge1 - 1 > fixed_tap)
+			tegra_host->tuned_tap_delay = edge1 - fixed_tap;
+		else
+			tegra_host->tuned_tap_delay = edge1 + fixed_tap;
+	}
+}
+
+static void tegra_sdhci_post_tuning(struct sdhci_host *host)
+{
+	struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+	struct sdhci_tegra *tegra_host = sdhci_pltfm_priv(pltfm_host);
+	const struct sdhci_tegra_soc_data *soc_data = tegra_host->soc_data;
+	u32 avg_tap_dly, val, min_tap_dly, max_tap_dly;
+	u8 fixed_tap, start_tap, end_tap, window_width;
+	u8 thdupper, thdlower;
+	u8 num_iter;
+	u32 clk_rate_mhz, period_ps, bestcase, worstcase;
+
+	/* retain HW tuned tap to use incase if no correction is needed */
+	val = sdhci_readl(host, SDHCI_TEGRA_VENDOR_CLOCK_CTRL);
+	tegra_host->tuned_tap_delay = (val & SDHCI_CLOCK_CTRL_TAP_MASK) >>
+				      SDHCI_CLOCK_CTRL_TAP_SHIFT;
+	if (soc_data->min_tap_delay && soc_data->max_tap_delay) {
+		min_tap_dly = soc_data->min_tap_delay;
+		max_tap_dly = soc_data->max_tap_delay;
+		clk_rate_mhz = tegra_host->curr_clk_rate / USEC_PER_SEC;
+		period_ps = USEC_PER_SEC / clk_rate_mhz;
+		bestcase = period_ps / min_tap_dly;
+		worstcase = period_ps / max_tap_dly;
+		/*
+		 * Upper and Lower bound thresholds used to detect merged and
+		 * bubble windows
+		 */
+		thdupper = (2 * worstcase + bestcase) / 2;
+		thdlower = worstcase / 4;
+		/*
+		 * fixed tap is used when HW tuning result contains single edge
+		 * and tap is set at fixed tap delay relative to the first edge
+		 */
+		avg_tap_dly = (period_ps * 2) / (min_tap_dly + max_tap_dly);
+		fixed_tap = avg_tap_dly / 2;
+
+		val = sdhci_readl(host, SDHCI_TEGRA_VNDR_TUN_STATUS1);
+		start_tap = val & SDHCI_TEGRA_VNDR_TUN_STATUS1_TAP_MASK;
+		end_tap = (val >> SDHCI_TEGRA_VNDR_TUN_STATUS1_END_TAP_SHIFT) &
+			  SDHCI_TEGRA_VNDR_TUN_STATUS1_TAP_MASK;
+		window_width = end_tap - start_tap;
+		num_iter = host->tuning_loop_count;
+		/*
+		 * partial window includes edges of the tuning range.
+		 * merged window includes more taps so window width is higher
+		 * than upper threshold.
+		 */
+		if (start_tap == 0 || (end_tap == (num_iter - 1)) ||
+		    (end_tap == num_iter - 2) || window_width >= thdupper) {
+			pr_debug("%s: Apply tuning correction\n",
+				 mmc_hostname(host->mmc));
+			tegra_sdhci_tap_correction(host, thdupper, thdlower,
+						   fixed_tap);
+		}
+	}
+
+	tegra_sdhci_set_tap(host, tegra_host->tuned_tap_delay);
+}
+
+static int tegra_sdhci_execute_hw_tuning(struct mmc_host *mmc, u32 opcode)
+{
+	struct sdhci_host *host = mmc_priv(mmc);
+	int err;
+
+	err = sdhci_execute_tuning(mmc, opcode);
+	if (!err && !host->tuning_err)
+		tegra_sdhci_post_tuning(host);
+
+	return err;
+}
+
 static void tegra_sdhci_set_uhs_signaling(struct sdhci_host *host,
 					  unsigned timing)
 {
@@ -778,17 +953,22 @@ static void tegra_sdhci_set_uhs_signaling(struct sdhci_host *host,
 	bool set_default_tap = false;
 	bool set_dqs_trim = false;
 	bool do_hs400_dll_cal = false;
+	u8 iter = TRIES_256;
+	u32 val;
 
 	tegra_host->ddr_signaling = false;
 	switch (timing) {
 	case MMC_TIMING_UHS_SDR50:
+		break;
 	case MMC_TIMING_UHS_SDR104:
 	case MMC_TIMING_MMC_HS200:
 		/* Don't set default tap on tunable modes. */
+		iter = TRIES_128;
 		break;
 	case MMC_TIMING_MMC_HS400:
 		set_dqs_trim = true;
 		do_hs400_dll_cal = true;
+		iter = TRIES_128;
 		break;
 	case MMC_TIMING_MMC_DDR52:
 	case MMC_TIMING_UHS_DDR50:
@@ -800,11 +980,25 @@ static void tegra_sdhci_set_uhs_signaling(struct sdhci_host *host,
 		break;
 	}
 
+	val = sdhci_readl(host, SDHCI_VNDR_TUN_CTRL0_0);
+	val &= ~(SDHCI_VNDR_TUN_CTRL0_TUN_ITER_MASK |
+		 SDHCI_VNDR_TUN_CTRL0_START_TAP_VAL_MASK |
+		 SDHCI_VNDR_TUN_CTRL0_MUL_M_MASK);
+	val |= (iter << SDHCI_VNDR_TUN_CTRL0_TUN_ITER_SHIFT |
+		0 << SDHCI_VNDR_TUN_CTRL0_START_TAP_VAL_SHIFT |
+		1 << SDHCI_VNDR_TUN_CTRL0_MUL_M_SHIFT);
+	sdhci_writel(host, val, SDHCI_VNDR_TUN_CTRL0_0);
+	sdhci_writel(host, 0, SDHCI_TEGRA_VNDR_TUN_CTRL1_0);
+
+	host->tuning_loop_count = (iter == TRIES_128) ? 128 : 256;
+
 	sdhci_set_uhs_signaling(host, timing);
 
 	tegra_sdhci_pad_autocalib(host);
 
-	if (set_default_tap)
+	if (tegra_host->tuned_tap_delay && !set_default_tap)
+		tegra_sdhci_set_tap(host, tegra_host->tuned_tap_delay);
+	else
 		tegra_sdhci_set_tap(host, tegra_host->default_tap);
 
 	if (set_dqs_trim)
@@ -1110,6 +1304,8 @@ static const struct sdhci_tegra_soc_data soc_data_tegra210 = {
 		    NVQUIRK_DIS_CARD_CLK_CONFIG_TAP |
 		    NVQUIRK_ENABLE_SDR50 |
 		    NVQUIRK_ENABLE_SDR104,
+	.min_tap_delay = 106,
+	.max_tap_delay = 185,
 };
 
 static const struct sdhci_ops tegra186_sdhci_ops = {
@@ -1150,9 +1346,23 @@ static const struct sdhci_tegra_soc_data soc_data_tegra186 = {
 		    NVQUIRK_DIS_CARD_CLK_CONFIG_TAP |
 		    NVQUIRK_ENABLE_SDR50 |
 		    NVQUIRK_ENABLE_SDR104,
+	.min_tap_delay = 84,
+	.max_tap_delay = 136,
+};
+
+static const struct sdhci_tegra_soc_data soc_data_tegra194 = {
+	.pdata = &sdhci_tegra186_pdata,
+	.nvquirks = NVQUIRK_NEEDS_PAD_CONTROL |
+		    NVQUIRK_HAS_PADCALIB |
+		    NVQUIRK_DIS_CARD_CLK_CONFIG_TAP |
+		    NVQUIRK_ENABLE_SDR50 |
+		    NVQUIRK_ENABLE_SDR104,
+	.min_tap_delay = 96,
+	.max_tap_delay = 139,
 };
 
 static const struct of_device_id sdhci_tegra_dt_match[] = {
+	{ .compatible = "nvidia,tegra194-sdhci", .data = &soc_data_tegra194 },
 	{ .compatible = "nvidia,tegra186-sdhci", .data = &soc_data_tegra186 },
 	{ .compatible = "nvidia,tegra210-sdhci", .data = &soc_data_tegra210 },
 	{ .compatible = "nvidia,tegra124-sdhci", .data = &soc_data_tegra124 },
@@ -1251,6 +1461,10 @@ static int sdhci_tegra_probe(struct platform_device *pdev)
 	host->mmc_host_ops.hs400_enhanced_strobe =
 			tegra_sdhci_hs400_enhanced_strobe;
 
+	if (!host->ops->platform_execute_tuning)
+		host->mmc_host_ops.execute_tuning =
+				tegra_sdhci_execute_hw_tuning;
+
 	rc = mmc_of_parse(host->mmc);
 	if (rc)
 		goto err_parse_dt;