Skip to content

Commit 296ae41

Browse files
committed
Assignment 5 conclusion
0 parents  commit 296ae41

8 files changed

+225
-0
lines changed

.env.example

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
COIN_API_KEY=
2+
OAUTH_CLIENT_ID=
3+
OAUTH_CLIENT_SECRET=
4+
NOTION_AUTH_URL=""
5+
INTERNAL_ACCESS_TOKEN=

.gitignore

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#Secret keys
2+
.env
3+
Keys/
4+
5+
# virtual environments
6+
demo1/
7+
.cache
8+
.venv-source-faker
9+
.venv-source-google-drive
10+
.venv-source-notion
11+
12+
# Autogenerated Keys
13+
token.json

authenticate.py

Whitespace-only changes.

main.py

+113
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
import os
2+
import json
3+
import airbyte as ab
4+
from google.auth.transport.requests import Request
5+
from google.oauth2.credentials import Credentials
6+
from google_auth_oauthlib.flow import InstalledAppFlow
7+
from googleapiclient.discovery import build
8+
from googleapiclient.errors import HttpError
9+
from pydantic import BaseModel, Field
10+
from datetime import datetime
11+
import pandas as pd
12+
13+
14+
# If modifying these scopes, delete the file token.json.
15+
SCOPES = ["https://www.googleapis.com/auth/drive"]
16+
17+
18+
def get_credentials():
    """Authenticate the user with Google OAuth and return valid credentials.

    Loads cached credentials from ``token.json`` when present, refreshes them
    if they are expired, and otherwise runs the installed-app OAuth flow
    (which requires ``credentials.json`` in the working directory).

    Returns:
        google.oauth2.credentials.Credentials: valid user credentials.
    """
    creds = None
    # Reuse previously saved credentials if they exist.
    if os.path.exists("token.json"):
        creds = Credentials.from_authorized_user_file("token.json", SCOPES)

    # If credentials are missing or invalid, refresh or request new ones.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file("credentials.json", SCOPES)
            creds = flow.run_local_server(port=0)
        # Persist the credentials whenever they changed — after a refresh as
        # well as after a brand-new OAuth flow — so token.json stays current.
        with open("token.json", "w") as token:
            token.write(creds.to_json())
    return creds
36+
37+
38+
def load_credentials_from_token():
    """Build an Airbyte OAuth credentials dict from the cached token.json.

    Returns:
        dict: ``auth_type`` plus the ``client_id``, ``client_secret`` and
        ``refresh_token`` values stored by the Google OAuth flow.

    Raises:
        FileNotFoundError: if ``token.json`` has not been created yet.
    """
    if not os.path.exists("token.json"):
        raise FileNotFoundError("token.json not found. Run authentication first.")

    with open("token.json", "r") as fh:
        token_data = json.load(fh)

    # Absent keys deliberately become None, matching dict.get semantics.
    credentials = {"auth_type": "Client"}
    for field in ("client_id", "client_secret", "refresh_token"):
        credentials[field] = token_data.get(field)
    return credentials
52+
53+
54+
def sync_google_drive(
    folder_url="https://drive.google.com/drive/folders/10DrawuhFx85xmr8v8vVB6PPOTVWPu7nI",
):
    """Sync a Google Drive folder through the Airbyte source-google-drive connector.

    Reads OAuth credentials from ``token.json`` (see
    ``load_credentials_from_token``), configures two streams — CSV files and
    unstructured documents — then reads the ``Unstructured_data`` stream and
    prints each cached dataset.

    Args:
        folder_url: URL of the Drive folder to sync. Defaults to the original
            hard-coded folder, so existing callers are unaffected.
    """
    try:
        credentials = load_credentials_from_token()

        source = ab.get_source(
            "source-google-drive",
            install_if_missing=True,
            config={
                "folder_url": folder_url,
                "credentials": credentials,
                "streams": [
                    {
                        "name": "Csv_data",
                        "format": {"filetype": "csv"},
                        "globs": ["**/*.csv"],
                    },
                    {
                        "name": "Unstructured_data",
                        "format": {"filetype": "unstructured"},
                    },
                ],
            },
        )
        # Validate the config and credentials before reading.
        source.check()

        # Only the unstructured-document stream is selected for this sync.
        source.select_streams("Unstructured_data")
        read_result = source.read()

        # Dump each cached stream, both as the raw dataset and as a DataFrame.
        for stream_name, cached_dataset in read_result.items():
            print(f"Stream Name: {stream_name}")
            print(cached_dataset)

            df = cached_dataset.to_pandas()
            print(df.to_string())

    except HttpError as error:
        print(f"An error occurred: {error}")
    except FileNotFoundError as e:
        print(e)
104+
105+
106+
def main():
    """Entry point: authenticate with Google, then run the Airbyte sync."""
    # Step 1: ensure token.json exists and is fresh.
    get_credentials()
    # Step 2: pull the Drive folder contents through Airbyte.
    sync_google_drive()


if __name__ == "__main__":
    main()

notion.py

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import airbyte as ab
import os
from notion_client import Client

# Resolve the token once through PyAirbyte's secret lookup (environment
# variables, .env files, ...) instead of indexing os.environ directly,
# which raises KeyError when the variable is unset. The same token is then
# shared by the Notion SDK client and the Airbyte source.
token = ab.get_secret("INTERNAL_ACCESS_TOKEN")

# Notion SDK client authenticated with the internal integration token.
notion = Client(auth=token)

# Airbyte Notion source configured for token (internal integration) auth.
source = ab.get_source(
    'source-notion',
    config = {
        "credentials":{
            "auth_type":"token",
            "token" : token
        }
    }
)

# Validate the configuration and credentials.
source.check()

requirement.txt

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
airbyte

test.py

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
import airbyte as ab
from airbyte.caches import BigQueryCache
import pandas

# Connector settings: dataset size and a fixed seed for reproducibility.
faker_settings = {
    "count": 5000,  # Adjust this to get a larger or smaller dataset
    "seed": 10,
}

# Create the synthetic-data source and apply the configuration.
source: ab.Source = ab.get_source("source-faker")
source.set_config(config=faker_settings)

# Verify the config and creds by running `check`:
source.check()

test2.py

+60
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import airbyte as ab
import pandas as pd
import matplotlib.pyplot as plt

# Configure the Coin API source connector (daily Coinbase spot index data).
source = ab.get_source(
    "source-coin-api",
    config={
        "api_key": ab.get_secret("COIN_API_KEY"),
        "environment": "production",
        "symbol_id": "COINBASE_SPOT_INDEX_USD",
        "period": "1DAY",
        "start_date": "2023-01-01T00:00:00",
    },
    streams="*",
)

# Make sure the configuration and credentials are valid before reading.
source.check()

# Sync the source into the default local cache.
cache = ab.get_default_cache()
result = source.read(cache=cache)

# Load the historical OHLCV stream into a DataFrame.
df = cache["ohlcv_historical_data"].to_pandas()

# Parse timestamps and coerce the price/volume columns to numeric values.
df["time_period_start"] = pd.to_datetime(df["time_period_start"])
value_columns = [
    "price_open",
    "price_high",
    "price_low",
    "price_close",
    "volume_traded",
    "trades_count",
]
df[value_columns] = df[value_columns].apply(pd.to_numeric, errors="coerce")

# Daily movement = close minus open for each period.
df["daily_movement"] = df["price_close"] - df["price_open"]

# Index by timestamp so matplotlib puts dates on the x-axis.
df.set_index("time_period_start", inplace=True)

# Draw the daily-movement line chart.
plt.figure(figsize=(12, 6))
plt.plot(df["daily_movement"], marker="o", linestyle="-")
plt.title("Daily Price Movement")
plt.xlabel("Date")
plt.ylabel("Price Movement")
plt.grid(True)
plt.xticks(rotation=45)  # Rotate date labels for readability.
plt.tight_layout()       # Keep labels from overlapping the figure edge.

# Persist the chart to disk.
plt.savefig("daily_price_movement.png")
print("Plot saved as 'daily_price_movement.png'")

0 commit comments

Comments
 (0)