diff --git a/src/service_api/main.py b/src/service_api/main.py index 78bc407bdb82b9e581ca56b16bafe0b9c763f634..65099caa674bc698e3575cb314c11263cc918d07 100644 --- a/src/service_api/main.py +++ b/src/service_api/main.py @@ -15,6 +15,7 @@ from src.structures import TrafficStats logging.config.dictConfig(LOGGING_CONFIG) logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) T = TypeVar("T") @@ -41,24 +42,25 @@ async def _query_instant( return None, None except (KeyError, ValueError) as parse_error: - logger.error("Error parsing response data:", exc_info=True) + logger.error("Error parsing response data:", exc_info=parse_error) raise HTTPException( status_code=status.HTTP_502_BAD_GATEWAY, detail=f"Error parsing Prometheus data: {parse_error}" - ) from parse_error + ) from None except httpx.RequestError as request_error: - logger.error("HTTP request error:", exc_info=True) + logger.error("HTTP request error:", exc_info=request_error) raise HTTPException( status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=f"HTTP request error: {request_error}" - ) from request_error + ) from None except httpx.HTTPStatusError as status_error: logger.error( - f"HTTP status error: {status_error.response.status_code} - {status_error.response.text}", exc_info=True + f"HTTP status error: {status_error.response.status_code} - {status_error.response.text}", + exc_info=status_error, ) raise HTTPException( status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=f"HTTP status error: {status_error}" - ) from status_error + ) from None async def _query_series(client: httpx.AsyncClient, cfg: PromConfig, query: str) -> list[dict[str, Any]]: @@ -73,24 +75,25 @@ async def _query_series(client: httpx.AsyncClient, cfg: PromConfig, query: str) return [] except (AttributeError, KeyError, ValueError) as parse_error: - logger.error("Error parsing series response data:", exc_info=True) + logger.error("Error parsing series response data:", exc_info=parse_error) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Error parsing Prometheus series data." - ) from parse_error + ) from None except httpx.RequestError as request_error: - logger.error(f"HTTP request error: {request_error}", exc_info=True) + logger.error(f"HTTP request error: {request_error}", exc_info=request_error) raise HTTPException( status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=f"HTTP request error: {request_error}" - ) from request_error + ) from None except httpx.HTTPStatusError as status_error: logger.error( - f"HTTP status error: {status_error.response.status_code} - {status_error.response.text}", exc_info=True + f"HTTP status error: {status_error.response.status_code} - {status_error.response.text}", + exc_info=status_error, ) raise HTTPException( status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=f"HTTP status error: {status_error}" - ) from status_error + ) from None async def _get_pod_inbound_traffic_bps( @@ -149,9 +152,9 @@ async def _get_pod_traffic_per_link_bytes_per_second( return traffic_bytes - except Exception as default_error: - logger.error("Error parsing response data:", exc_info=True) - raise HTTPException(status_code=500, detail="Error parsing data.") from default_error + except Exception as ex: + logger.error("Error parsing response data:", exc_info=ex) + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Error parsing data.") from None async def _get_pod_inbound_traffic_rate( @@ -198,12 +201,12 @@ async def _get_outbound_traffic_rate_by_status_code( return traffic_res_rate_by_code - except (AttributeError, KeyError, ValueError) as parse_error: - logger.error(f"Error parsing series response data: {parse_error}") + except (AttributeError, KeyError, ValueError) as ex: + logger.error("Error parsing series response data:", exc_info=ex) raise HTTPException( status_code=status.HTTP_503_SERVICE_UNAVAILABLE, - detail=f"Error parsing Prometheus series data: {parse_error}", - ) from parse_error + detail="Error parsing Prometheus series data.", + ) from None async def _get_pod_outbound_traffic_latency( @@ -228,10 +231,10 @@ async def _get_pod_outbound_traffic_latency( return results except Exception as ex: - logger.error(f"Request error while fetching pod traffic response stats:\n{ex}") + logger.error("Request error while fetching pod traffic response stats.", exc_info=ex) raise HTTPException( - status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=f"Request error while fetching data: {ex}" - ) from ex + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="Request error while fetching data" + ) from None app = FastAPI() @@ -290,14 +293,14 @@ async def get_workloads_list(): return workloads -async def get_pod_by_name_or_uid(client: httpx.AsyncClient, pod_name_or_uid: str, cfg: PromConfig) -> dict: +async def get_pod_by_name_or_uid(client: httpx.AsyncClient, pod_name_or_uid: str, cfg: PromConfig) -> dict[str, Any]: try: uid = UUID(pod_name_or_uid) query = f'kube_pod_labels{{uid="{uid}"}}' pod, _ = await _query_instant(client, cfg, query) if pod is None: - raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY) + raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY, detail="Unable to obtain pod info.") return pod @@ -310,7 +313,7 @@ async def get_pod_by_name_or_uid(client: httpx.AsyncClient, pod_name_or_uid: str pod, _ = await _query_instant(client, cfg, query) if pod is None: - raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY) + raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY, detail="Unable to obtain pod info.") return pod @@ -332,37 +335,42 @@ async def get_pod_details(pod_name_or_uid: str) -> Optional[PodInfoExtended]: _, value = await _query_instant(client, cfg, query) return int(value) - async def get_num_replicas(client, pod): + async def get_num_replicas(client, pod) -> Optional[int]: # Step 1: Get the ReplicaSet owner of the pod query = f'kube_pod_owner{{namespace="{pod["namespace"]}", pod="{pod["pod"]}"}}' metric, _ = await _query_instant(client, cfg, query) if metric is None: - raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY) + # raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY, detail="Unable to obtain pod owner") + return None replicaset_name = metric.get("owner_name") if replicaset_name is None: - raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY) + # raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY, detail="Unable to obtain pod owner") + return None # Step 2: Get the Deployment owner of the ReplicaSet query = f'kube_replicaset_owner{{namespace="{pod["namespace"]}", replicaset="{replicaset_name}"}}' metric, _ = await _query_instant(client, cfg, query) if metric is None: - raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY) + # raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY, detail="Unable to obtain replica owner") + return None deployment_name = metric.get("owner_name") if deployment_name is None: - raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY) + # raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY, detail="Unable to obtain replica owner") + return None # Step 3: Figure out how many deployments are there query = f'kube_deployment_spec_replicas{{namespace="{pod["namespace"]}", deployment="{deployment_name}"}}' _, value = await _query_instant(client, cfg, query) if value is None: - raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY) + # raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY) + return None return int(value) @@ -370,8 +378,21 @@ async def get_pod_details(pod_name_or_uid: str) -> Optional[PodInfoExtended]: pod = await get_pod_by_name_or_uid(client, pod_name_or_uid, cfg) uid = pod.get("uid") + + if uid is None: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Invalid UID '{uid}'", + ) + pod_name = pod.get("pod") + if pod_name is None: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Could not obtain pod name '{uid}' -> '{pod_name}'", + ) + # obtain last valid state pod["current_state"] = get_current_state(client, uid) @@ -407,4 +428,7 @@ async def get_traffic_stats(pod_name_or_uid: str): pod = await get_pod_by_name_or_uid(client, pod_name_or_uid, cfg) results = await _get_pod_traffic_per_link_bytes_per_second(client, pod["pod"], timestamp, cfg) + if not results: + raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="Traffic data not available.") + return results