Initial Commit of the PDM project (ready for DWS migration)

2026-04-20 08:42:38 -05:00
commit dda7b664e7
2721 changed files with 442772 additions and 0 deletions
--- a/migrate_folderdata.py
+++ b/migrate_folderdata.py
@@ -0,0 +1,734 @@
+"""
+Data Migration Script
+Migrates VariableValue data from source to target database with ID mapping.
+"""
+import json
+import logging
+import csv
+import os
+import glob
+from datetime import datetime
+from db_utils import DatabaseConnection
+
+
+class DataMigrator:
+    def __init__(self, config_path='config.json'):
+        """Initialize the migrator with configuration."""
+        with open(config_path, 'r') as f:
+            self.config = json.load(f)
+
+        self.source_conn = None  # this is the source DB connection (check config.json)
+        self.target_conn = None  # this is the target DB connection (check config.json)
+        self.project_map = {}  # this is where we store all the ProjectID maps between |source <-> target|
+        self.variable_map = {}  # this is where we store all the VariableID maps between |source <-> target|
+        self.project_mapping_details = []  # Store full mapping details for export
+        self.variable_mapping_details = []  # Store full mapping details for export
+
+        # Store database identifiers for progress tracking
+        self.source_db_name = self.config['source_db']['database']
+        self.target_db_name = self.config['target_db']['database']
+
+        # Setup logging and timestamp
+        self.timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+        log_filename = f"folderdata_migration_{self.timestamp}.log"
+        logging.basicConfig(
+            level=logging.INFO,
+            format='%(asctime)s - %(levelname)s - %(message)s',
+            handlers=[
+                logging.FileHandler(log_filename),
+                logging.StreamHandler()
+            ]
+        )
+        self.logger = logging.getLogger(__name__)
+
+    def connect_databases(self):
+        """Establish connections to source and target databases."""
+        self.logger.info("Connecting to source database...")
+        self.source_conn = DatabaseConnection(self.config['source_db'])  # use the DB info in config.json to connect
+
+        self.logger.info("Connecting to target database...")
+        self.target_conn = DatabaseConnection(self.config['target_db'])  # use the DB info in config.json to connect
+
+        self.logger.info("Database connections established.")
+
+    def transform_source_path(self, source_path):
+        """Transform source path to expected target path by prepending root folder."""
+        if not source_path:
+            return None
+
+        # Get target root folder from config
+        # TODO this is defaulting to Citadel, need to change to make this more generic
+        target_root = self.config.get('path_mapping', {}).get('target_root_folder', 'Citadel')
+
+        # Strip leading backslash if exists
+        path_without_leading = source_path.lstrip('\\')
+
+        # Handle root path specially. The way the code is set up our Source Root will always be blank
+        if not path_without_leading or path_without_leading == '':
+            # Source root maps to target root
+            return f'\\{target_root}\\'
+
+        # Prepend target root folder
+        transformed = f'\\{target_root}\\{path_without_leading}'
+
+        # Ensure trailing backslash if original had it
+        if source_path.endswith('\\') and not transformed.endswith('\\'):
+            transformed += '\\'
+
+        return transformed
+
+    def build_project_mapping(self):
+        """Build mapping of source ProjectID to target ProjectID based on folder paths."""
+        """NOTE: If you try to match based on names you are going to get a lot of the same folder names"""
+        self.logger.info("Building project ID mapping using path-based matching...")
+
+        # Fetch projects from source with paths (exclude deleted projects)
+        source_projects = self.source_conn.execute_query(
+            "SELECT ProjectID, Name, Path FROM Projects WHERE Deleted = 0 OR Deleted IS NULL"
+        )
+
+        # Fetch projects from target with paths (exclude deleted projects)
+        target_projects = self.target_conn.execute_query(
+            "SELECT ProjectID, Name, Path FROM Projects WHERE Deleted = 0 OR Deleted IS NULL"
+        )
+
+        self.logger.info(f"Found {len(source_projects)} active projects in source database (deleted projects excluded)")
+        self.logger.info(f"Found {len(target_projects)} active projects in target database (deleted projects excluded)")
+
+        # Create path-to-ID mapping for target
+        # Handle case sensitivity based on config, we default to FALSE
+        case_sensitive = self.config.get('path_mapping', {}).get('case_sensitive', False)
+
+        target_path_map = {}
+        # in this loop we are going through all of the folders in the Target DB (PDM calls them projects) and
+        # creating the key:value pair [path]:[project_DB_row]
+        for project in target_projects:
+            path = project['Path']
+            if path:
+                # Normalize path for case-insensitive matching if needed
+                key = path if case_sensitive else path.lower()
+                target_path_map[key] = project
+
+        self.logger.info(f"Built target path index with {len(target_path_map)} paths")
+
+        # Build source-to-target ID mapping
+        mapped_count = 0
+        unmapped_count = 0
+        null_path_count = 0
+
+        for source_project in source_projects:
+            source_id = source_project['ProjectID']
+            project_name = source_project['Name']
+            source_path = source_project['Path']
+
+            # Skip projects with no path
+            if not source_path:
+                self.logger.warning(f"Project '{project_name}' (ID: {source_id}) has no path - skipping")
+                null_path_count += 1
+                continue
+
+            # Transform source path to expected target path
+            target_path = self.transform_source_path(source_path)
+
+            # This is more of a precaution, it really should never be triggered, I guess maybe this would show you there was an error w/root
+            if not target_path:
+                self.logger.warning(f"Could not transform path for project '{project_name}' (ID: {source_id})")
+                unmapped_count += 1
+                continue
+
+            # Here we handle case sensitivity if its required.
+            # target_path - this is actually our "expected target path" or basically our source + new root folder
+            # lookup_key - this is basically just a case sensitive version of target_path
+            lookup_key = target_path if case_sensitive else target_path.lower()
+
+            # Here we are taking all of the paths we are expecting to find (lookup_key) and seeing if we actually find them in the TargetDB
+            # the if clause only gets triggered on the matches
+            if lookup_key in target_path_map:
+                target_project = target_path_map[lookup_key]
+                target_id = target_project['ProjectID']
+
+                # if we find a match we map it to Global project_map in our DataMigrator class (created in __init__)
+                self.project_map[source_id] = target_id
+
+                # Store full details for export this is used for logging in the CSV so we can manually review the mappings
+                self.project_mapping_details.append({
+                    'ProjectName': project_name,
+                    'SourceID': source_id,
+                    'TargetID': target_id,
+                    'SourcePath': source_path,
+                    'TargetPath': target_path
+                })
+
+                self.logger.debug(f"Mapped Project '{project_name}': {source_id} -> {target_id} (Path: {source_path} -> {target_path})")
+                mapped_count += 1
+            else:
+                # Enhanced debugging for unmapped paths
+                self.logger.warning(f"Project '{project_name}' (ID: {source_id}) - Target path not found")
+                self.logger.warning(f"  Source path: [{source_path}]")
+                self.logger.warning(f"  Transformed to: [{target_path}]")
+                self.logger.warning(f"  Lookup key: [{lookup_key}]")
+
+                # Check for similar paths (useful for finding typos/whitespace issues)
+                similar = [p for p in target_path_map.keys() if project_name.lower() in p.lower()][:3]
+                if similar:
+                    self.logger.warning(f"  Similar paths in target: {similar}")
+
+                unmapped_count += 1
+
+        self.logger.info(f"Project mapping complete:")
+        self.logger.info(f"--- Successfully mapped: {mapped_count} projects")
+        self.logger.info(f"--- Unmapped (path not found): {unmapped_count} projects")
+        self.logger.info(f"--- Skipped (null path): {null_path_count} projects")
+        self.logger.info(f"--- Total in project_map: {len(self.project_map)} projects")
+
+    def build_variable_mapping(self):
+        """Build mapping of source VariableID to target VariableID based on variable names."""
+        """NOTE: The logic in this section is different than the Project Mapping"""
+        """NOTE: the reason for this is that there cannot be duplicate Variable names so we can match on Name"""
+        self.logger.info("Building variable ID mapping...")
+
+        # Fetch variables from source (exclude deleted variables)
+        source_variables = self.source_conn.execute_query(
+            "SELECT VariableID, VariableName FROM Variable WHERE IsDeleted = 0 OR IsDeleted IS NULL"
+        )
+
+        # Fetch variables from target (exclude deleted variables)
+        target_variables = self.target_conn.execute_query(
+            "SELECT VariableID, VariableName FROM Variable WHERE IsDeleted = 0 OR IsDeleted IS NULL"
+        )
+
+        # Filter out system variables (names in curly brackets like {GUID})
+        source_variables_before_filter = len(source_variables)
+        target_variables_before_filter = len(target_variables)
+
+        # The PDMVault has a two kinds of variables. All the user created variables show up with human readable names
+        # the system variables show up like Name = {ASDFASD-FASDFSDF-ADFDFDFS}
+        # so in these two lines, we are making sure we ignore the system variables by ignoring { }
+        source_variables = [v for v in source_variables if not (v['VariableName'].startswith('{') and v['VariableName'].endswith('}'))]
+        target_variables = [v for v in target_variables if not (v['VariableName'].startswith('{') and v['VariableName'].endswith('}'))]
+
+        # We are keeping track of how many system variables we are skipping
+        source_system_vars_excluded = source_variables_before_filter - len(source_variables)
+        target_system_vars_excluded = target_variables_before_filter - len(target_variables)
+
+        self.logger.info(f"Found {len(source_variables)} user variables in source database (deleted: excluded, system variables: {source_system_vars_excluded} excluded)")
+        self.logger.info(f"Found {len(target_variables)} user variables in target database (deleted: excluded, system variables: {target_system_vars_excluded} excluded)")
+
+        # Create name-to-ID mapping for target
+        target_map = {row['VariableName']: row['VariableID'] for row in target_variables}
+
+        # Build source-to-target ID mapping
+        for source_variable in source_variables:
+            source_id = source_variable['VariableID']
+            variable_name = source_variable['VariableName']
+
+            if variable_name in target_map:
+                target_id = target_map[variable_name]
+                self.variable_map[source_id] = target_id
+                # Store full details for export
+                self.variable_mapping_details.append({
+                    'VariableName': variable_name,
+                    'SourceID': source_id,
+                    'TargetID': target_id
+                })
+                self.logger.debug(f"Mapped Variable '{variable_name}': {source_id} -> {target_id}")
+            else:
+                self.logger.warning(f"Variable '{variable_name}' (ID: {source_id}) not found in target database")
+
+        self.logger.info(f"Variable mapping complete. Mapped {len(self.variable_map)} variables.")
+
+    def validate_mappings(self):
+        """Validate project and variable mappings for integrity."""
+        self.logger.info("Validating mappings...")
+
+        validation_issues = []
+
+        # Check for duplicate target IDs in project mapping (multiple sources -> same target)
+        # NOTE This was put inplace before we switched to matching via project path. It really shouldnt trigger
+        # NOTE but we are going to leave it in, in case it does. If it does trigger its a sign of having duplicate names
+        target_id_counts = {}
+
+        # here are making a dictionary where...
+        # key/value ..... [targetID]:[sourceID_list]
+        # basically if the sourceID_list is longer than one that shows that we are mapping multiple things to the same target
+        for source_id, target_id in self.project_map.items():
+            if target_id not in target_id_counts:
+                target_id_counts[target_id] = []
+            target_id_counts[target_id].append(source_id)
+
+        # basically if the sourceID_list is longer than one that shows that we are mapping multiple things to the same target
+        for target_id, source_ids in target_id_counts.items():
+            if len(source_ids) > 1:
+                issue = f"Multiple source projects ({source_ids}) map to same target project {target_id}"
+                validation_issues.append(issue)
+                self.logger.warning(issue)
+
+        # Check for duplicate target IDs in variable mapping
+        # NOTE: I can't think of a reason why any of this would ever trigger, but its in here
+        var_target_id_counts = {}
+        for source_id, target_id in self.variable_map.items():
+            if target_id not in var_target_id_counts:
+                var_target_id_counts[target_id] = []
+            var_target_id_counts[target_id].append(source_id)
+
+        for target_id, source_ids in var_target_id_counts.items():
+            if len(source_ids) > 1:
+                issue = f"Multiple source variables ({source_ids}) map to same target variable {target_id}"
+                validation_issues.append(issue)
+                self.logger.warning(issue)
+
+        # logging the issues 
+        if validation_issues:
+            self.logger.warning(f"Found {len(validation_issues)} validation issues")
+        else:
+            self.logger.info("All mappings validated successfully - no integrity issues found")
+
+        return validation_issues
+
+    def export_mappings_to_csv(self):
+        """Export ID mappings to CSV files for manual review."""
+        self.logger.info("Exporting mappings to CSV files...")
+
+        # Export project mappings
+        project_csv = f"mapping_projects_{self.timestamp}.csv"
+        with open(project_csv, 'w', newline='', encoding='utf-8') as f:
+            writer = csv.DictWriter(f, fieldnames=['ProjectName', 'SourceID', 'TargetID', 'SourcePath', 'TargetPath'])
+            writer.writeheader()
+            writer.writerows(self.project_mapping_details)
+
+        self.logger.info(f"Project mappings exported to: {project_csv} ({len(self.project_mapping_details)} mappings)")
+
+        # Export variable mappings
+        variable_csv = f"mapping_variables_{self.timestamp}.csv"
+        with open(variable_csv, 'w', newline='', encoding='utf-8') as f:
+            writer = csv.DictWriter(f, fieldnames=['VariableName', 'SourceID', 'TargetID'])
+            writer.writeheader()
+            writer.writerows(self.variable_mapping_details)
+
+        self.logger.info(f"Variable mappings exported to: {variable_csv} ({len(self.variable_mapping_details)} mappings)")
+
+    def save_migration_progress(self, batch_num, total_batches, stats):
+        """Save migration progress to resume on failure."""
+        """NOTE: Migration takes several hours and is subject to timeouts/connection issues. Need a way to resume gracefully"""
+        
+        # Create database-specific progress filename
+        # NOTE we create this file when the migration starts. Its specific to the DB's being migrated so you could conceivably
+        # NOTE go back and forth between migrations if you wanted
+        # NOTE when the migration is finished this file gets deleted
+        progress_file = f"migration_progress_{self.source_db_name}_to_{self.target_db_name}_{self.timestamp}.json"
+
+        # In config.json we specify how we are going to batch the migration and then as we migrate we keep stats of it
+        # essentially we commit changes every 10 batches so we are tracking that and rolling back to the nearest 10th if we have an error
+        progress_data = {
+            'timestamp': datetime.now().isoformat(),
+            'migration_timestamp': self.timestamp,
+            'source_database': self.source_db_name,
+            'target_database': self.target_db_name,
+            'last_completed_batch': batch_num,
+            'total_batches': total_batches,
+            'records_inserted': stats['inserted'],
+            'records_updated': stats['updated'],
+            'records_errors': stats['errors']
+        }
+        with open(progress_file, 'w') as f:
+            json.dump(progress_data, f, indent=2)
+        self.logger.info(f"Progress saved: batch {batch_num}/{total_batches}")
+
+    def load_migration_progress(self):
+        """Load previous migration progress if exists for current database pair."""
+        # Look for progress files matching current database combination
+        # for instance if you are trying to Migrate A->B and then it times out and you try doing A->C you wont get
+        # any errors, otherwise it would see a progress file and assume the progress on A->B was the progress of your A->C migration
+        progress_pattern = f"migration_progress_{self.source_db_name}_to_{self.target_db_name}_*.json"
+        progress_files = glob.glob(progress_pattern)
+
+        if not progress_files:
+            self.logger.info("No previous migration progress found for this database pair")
+            return None
+
+        # Get most recent progress file for this database pair
+        latest_progress = max(progress_files, key=os.path.getmtime)
+
+        with open(latest_progress, 'r') as f:
+            progress = json.load(f)
+
+        # Validate that progress file matches current databases
+        if (progress.get('source_database') != self.source_db_name or
+            progress.get('target_database') != self.target_db_name):
+            self.logger.warning(f"Found progress file but database names don't match. Ignoring.")
+            return None
+
+        self.logger.info(f"Found previous migration progress: {latest_progress}")
+        self.logger.info(f"  Source DB: {progress['source_database']} -> Target DB: {progress['target_database']}")
+        self.logger.info(f"  Last completed batch: {progress['last_completed_batch']}/{progress['total_batches']}")
+        self.logger.info(f"  Inserted so far: {progress['records_inserted']}")
+        self.logger.info(f"  Updated so far: {progress['records_updated']}")
+        self.logger.info(f"  Errors so far: {progress['records_errors']}")
+
+        return progress
+
+    def cleanup_progress_file(self):
+        """Remove progress file after successful completion for this database pair."""
+        """NOTE: This function is called from migrate_variable_values() """
+        """NOTE: This function is only called after the final DB commit """
+        progress_pattern = f"migration_progress_{self.source_db_name}_to_{self.target_db_name}_*.json"
+        progress_files = glob.glob(progress_pattern)  #TODO look more into glob, but its basically a path matching tool
+
+        for pf in progress_files:
+            os.remove(pf)
+            self.logger.info(f"Removed progress file: {pf}")
+
+    def migrate_variable_values(self):
+        """Migrate VariableValue records from source to target using ID mappings with UPSERT logic."""
+        """ NOTE: We are using Upsert logic which means if a Folder card already has data in the Target """
+        """ NOTE: This migration is going to revert it to matching whatever is in the Source. """
+        self.logger.info("Starting VariableValue migration (UPSERT mode)...")
+
+        # Get configuration. In config.json we set how we want to batch it
+        # NOTE: We are defaulting to 500 Batch sizes
+        batch_size = self.config.get('migration', {}).get('batch_size', 500)
+
+        self.logger.info(f"Migration settings: batch_size={batch_size}, mode=UPSERT (insert new, update existing)")
+
+        # Fetch all VariableValue records from source
+        # TODO: REMOVE DocumentID=1 if you want to try migrating all the variable values 
+        # TODO: Do that at your own risk
+        # TODO: Probably don't do that
+        # Only fetch records for folders (DocumentID = 1)
+        # ORDER BY ensures consistent ordering across runs for reliable resume
+        source_values = self.source_conn.execute_query(
+            "SELECT * FROM VariableValue WHERE DocumentID = 1 ORDER BY ProjectID, VariableID, RevisionNo"
+        )
+
+        self.logger.info(f"Found {len(source_values)} VariableValue records in source (DocumentID=1 only).")
+
+        # Prepare MERGE query for UPSERT operation
+        # MERGE will INSERT if not exists, UPDATE if exists
+        # NOTE: In this context the source is the staged data that we are about to insert. It has already been converted.
+        # NOTE: Basically we are saying We have a correctly formatted row we are about to insert called "SOURCE" does it already exist in our target DB
+        merge_query = """
+            MERGE INTO VariableValue AS target
+            USING (SELECT ? AS VariableID, ? AS DocumentID, ? AS ProjectID, ? AS RevisionNo,
+                          ? AS ConfigurationID, ? AS ValueText, ? AS ValueInt, ? AS ValueFloat,
+                          ? AS ValueDate, ? AS ValueCache, ? AS IsLongText) AS source
+            ON (target.VariableID = source.VariableID
+                AND target.ProjectID = source.ProjectID
+                AND target.DocumentID = source.DocumentID)
+            WHEN MATCHED THEN
+                UPDATE SET
+                    RevisionNo = source.RevisionNo,
+                    ConfigurationID = source.ConfigurationID,
+                    ValueText = source.ValueText,
+                    ValueInt = source.ValueInt,
+                    ValueFloat = source.ValueFloat,
+                    ValueDate = source.ValueDate,
+                    ValueCache = source.ValueCache,
+                    IsLongText = source.IsLongText
+            WHEN NOT MATCHED THEN
+                INSERT (VariableID, DocumentID, ProjectID, RevisionNo, ConfigurationID, ValueText, ValueInt, ValueFloat, ValueDate, ValueCache, IsLongText)
+                VALUES (source.VariableID, source.DocumentID, source.ProjectID, source.RevisionNo, source.ConfigurationID, source.ValueText, source.ValueInt, source.ValueFloat, source.ValueDate, source.ValueCache, source.IsLongText);
+        """
+
+        # Collect records to upsert, with detailed tracking
+        params_list = []
+        record_metadata = []  # Store metadata for logging
+        skipped_unmapped = 0
+        skipped_invalid_doc = 0
+
+        for record in source_values:
+            # Extract source IDs
+            source_variable_id = record.get('VariableID')
+            source_project_id = record.get('ProjectID')
+            source_docID = record.get('DocumentID')
+
+            # Skip if DocumentID is not 1 (safety check, should be filtered by query)
+            if source_docID != 1:
+                self.logger.warning(f"Skipping record: ProjectID {source_project_id} - VariableID {source_variable_id} DocumentID is not == 1")
+                skipped_invalid_doc += 1
+                continue
+
+            # Map to target IDs
+            target_variable_id = self.variable_map.get(source_variable_id)
+            target_project_id = self.project_map.get(source_project_id)
+
+            # Skip if mapping not found
+            if target_variable_id is None:
+                self.logger.debug(f"Skipping record: VariableID {source_variable_id} not mapped")
+                skipped_unmapped += 1
+                continue
+
+            if target_project_id is None:
+                self.logger.debug(f"Skipping record: ProjectID {source_project_id} not mapped")
+                skipped_unmapped += 1
+                continue
+
+            # Extract other fields
+            source_rev = record.get('RevisionNo')
+            source_configID = record.get('ConfigurationID')
+            source_valueText = record.get('ValueText')
+            source_valueInt = record.get('ValueInt')
+            source_valueFloat = record.get('ValueFloat')
+            source_valueDate = record.get('ValueDate')
+            source_valueCache = record.get('ValueCache')
+            source_islongtext = record.get('IsLongText')
+
+            # Prepare params for MERGE statement
+            # Order matches the SELECT in MERGE USING clause
+            params = (
+                target_variable_id,   # VariableID
+                source_docID,         # DocumentID
+                target_project_id,    # ProjectID
+                source_rev,           # RevisionNo
+                source_configID,      # ConfigurationID
+                source_valueText,     # ValueText
+                source_valueInt,      # ValueInt
+                source_valueFloat,    # ValueFloat
+                source_valueDate,     # ValueDate
+                source_valueCache,    # ValueCache
+                source_islongtext     # IsLongText
+            )
+            params_list.append(params)
+
+            # Store metadata for detailed logging if needed
+            record_metadata.append({
+                'target_variable_id': target_variable_id,
+                'target_project_id': target_project_id,
+                'source_variable_id': source_variable_id,
+                'source_project_id': source_project_id
+            })
+
+        self.logger.info(f"Prepared {len(params_list)} records for UPSERT")
+        self.logger.info(f"Skipped {skipped_unmapped} records (unmapped IDs)")
+        
+        if skipped_invalid_doc > 0:
+            self.logger.info(f"Skipped {skipped_invalid_doc} records (invalid DocumentID)")
+
+        # Check for existing progress and offer to resume
+        previous_progress = self.load_migration_progress()
+        start_batch = 0
+        total_stats = {'inserted': 0, 'updated': 0, 'errors': 0}
+
+        if previous_progress:
+            self.logger.info(f"*** Previous migration found ***")
+            self.logger.info(f"Last completed batch: {previous_progress['last_completed_batch']}/{previous_progress['total_batches']}")
+            self.logger.info(f"Records inserted: {previous_progress['records_inserted']}")
+            self.logger.info(f"Records updated: {previous_progress['records_updated']}")
+
+            # Try to prompt user, if fails (non-interactive), automatically resume
+            try:
+                print(f"\n*** Previous migration found ***")
+                print(f"Last completed batch: {previous_progress['last_completed_batch']}/{previous_progress['total_batches']}")
+                print(f"Records inserted: {previous_progress['records_inserted']}")
+                print(f"Records updated: {previous_progress['records_updated']}")
+                response = input(f"Resume from batch {previous_progress['last_completed_batch'] + 1}? (y/n): ").strip().lower()
+            except EOFError:
+                # Non-interactive mode - automatically resume
+                response = 'y'
+                self.logger.info("Running in non-interactive mode - automatically resuming")
+
+            if response == 'y':
+                start_batch = previous_progress['last_completed_batch']
+                total_stats['inserted'] = previous_progress['records_inserted']
+                total_stats['updated'] = previous_progress['records_updated']
+                total_stats['errors'] = previous_progress['records_errors']
+                self.logger.info(f"Resuming from batch {start_batch + 1}")
+            else:
+                self.logger.info("Starting fresh migration (previous progress will be overwritten)")
+                self.cleanup_progress_file()
+
+        # Get commit interval from config
+        commit_interval = self.config.get('migration', {}).get('commit_interval', 10)
+        self.logger.info(f"Commit interval: every {commit_interval} batches")
+
+        # Process in batches with periodic commits
+        total_batches = (len(params_list) + batch_size - 1) // batch_size
+
+        try:
+            for i in range(start_batch * batch_size, len(params_list), batch_size):
+                batch = params_list[i:i + batch_size]
+                batch_num = (i // batch_size) + 1
+
+                self.logger.info(f"Processing batch {batch_num}/{total_batches} ({len(batch)} records)...")
+
+                # Execute MERGE for each record in batch
+                batch_inserted = 0
+                batch_updated = 0
+                batch_errors = 0
+
+                for j, params in enumerate(batch):
+                    try:
+                        cursor = self.target_conn.connection.cursor()
+                        cursor.execute(merge_query, params)
+                        affected = cursor.rowcount
+                        cursor.close()
+
+                        # MERGE returns 1 for INSERT, 2 for UPDATE (1 match + 1 update)
+                        if affected == 1:
+                            batch_inserted += 1
+                        elif affected == 2:
+                            batch_updated += 1
+
+                    except Exception as e:
+                        meta = record_metadata[i + j]
+                        var_name = next((v['VariableName'] for v in self.variable_mapping_details if v['TargetID'] == meta['target_variable_id']), f"ID:{meta['target_variable_id']}")
+                        proj_name = next((p['ProjectName'] for p in self.project_mapping_details if p['TargetID'] == meta['target_project_id']), f"ID:{meta['target_project_id']}")
+
+                        self.logger.error(f"Failed to UPSERT - Variable: '{var_name}' (ID: {meta['target_variable_id']}), Project: '{proj_name}' (ID: {meta['target_project_id']}), Error: {e}")
+                        batch_errors += 1
+
+                total_stats['inserted'] += batch_inserted
+                total_stats['updated'] += batch_updated
+                total_stats['errors'] += batch_errors
+
+                self.logger.info(f"Batch {batch_num} complete: inserted={batch_inserted}, updated={batch_updated}, errors={batch_errors}")
+
+                # Commit every N batches to keep connection alive and save progress
+                if batch_num % commit_interval == 0:
+                    self.target_conn.commit()
+                    self.save_migration_progress(batch_num, total_batches, total_stats)
+                    self.logger.info(f"[COMMIT] Transaction committed at batch {batch_num} (every {commit_interval} batches)")
+
+            # Final commit for any remaining batches
+            self.target_conn.commit()
+            self.cleanup_progress_file()
+            self.logger.info("[SUCCESS] Final transaction committed successfully")
+            self.logger.info("[SUCCESS] Migration completed - progress file cleaned up")
+
+        except Exception as e:
+            self.logger.error(f"Migration failed at batch {batch_num}: {e}")
+            self.logger.error(f"Progress has been saved. Run the migration again to resume from batch {((batch_num // commit_interval) * commit_interval) + 1}")
+            # Don't rollback - we want to keep what was already committed
+            raise
+
+        self.logger.info(f"Migration complete:")
+        self.logger.info(f"  - Total inserted (new): {total_stats['inserted']}")
+        self.logger.info(f"  - Total updated (existing): {total_stats['updated']}")
+        self.logger.info(f"  - Unmapped skipped: {skipped_unmapped}")
+        self.logger.info(f"  - Errors: {total_stats['errors']}")
+
+    def validate_migration(self):
+        # After the migration runs we will run this script to validate that everything was inserted correctly into the Target DB
+        self.logger.info("Running Validation on the Target to ensure we completed migration successfully")
+
+        # the primary key columns are VariableID, DocumentID, ProjectID, RevisionNo, ConfigurationID
+        source_values = self.source_conn.execute_query(
+            """SELECT VariableID, DocumentID, ProjectID, RevisionNo, ConfigurationID FROM VariableValue
+               WHERE DocumentID = 1
+               ORDER BY DocumentID, VariableID, RevisionNo"""
+        )
+
+        target_values = self.target_conn.execute_query(
+            """SELECT VariableID, DocumentID, ProjectID, RevisionNo, ConfigurationID FROM VariableValue
+               WHERE DocumentID = 1
+               ORDER BY DocumentID, VariableID, RevisionNo"""
+        )
+
+        # --------------------------------------
+        # Convert target_values to a set of tuples for fast lookup
+        # ---------------------------------------
+        self.logger.info("Building target record set for comparison...")
+        target_set = set()  # this is every row of the VariableValue table in the Target DB
+        for record in target_values:
+            key = (
+                record['VariableID'],
+                record['DocumentID'],
+                record['ProjectID'],
+                record['RevisionNo'],
+                record['ConfigurationID']
+            )
+            target_set.add(key)
+
+        error_list = []  # this is the container we are going to use to hold the errors we find
+        success_count = 0  # we will just tally the records we find
+        ignore_count = 0  # counter for rows that we didnt map because we couldnt find a projectID in the Target DB
+
+        # now we search the target for each row using the mapped values to make sure that it is in there, we log it if we can't find it
+        # --------------------------------------
+        # Create a CSV to log all the rows we think we are missing and begin the scan
+        # ---------------------------------------
+        doc_csv_filename = f'validation_missing_folderdata_{self.timestamp}.csv'
+
+        with open(doc_csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
+            writer = csv.writer(csvfile)
+            writer.writerow(['Target_VariableID', 'DocumentID', 'Target_ProjectID', 'RevisionNo', 'ConfigurationID'])  # write header
+            self.logger.info(f"Prepared {len(source_values)} records. Beginning validation with Target DB")
+        
+            for record in source_values:
+                # Extract source IDs
+                source_variable_id = record.get('VariableID')
+                source_project_id = record.get('ProjectID')
+
+                # Map to target IDs
+                target_variable_id = self.variable_map.get(source_variable_id)
+                target_project_id = self.project_map.get(source_project_id)
+
+                # Extract other fields
+                source_rev = record.get('RevisionNo')
+                source_configID = record.get('ConfigurationID')
+                source_documentID = record.get("DocumentID")  # even though all of these should be 1 we are still going to pull the actual value
+
+                # Prepare params for MERGE statement
+                # DocumentID is always 1 for folders in the PDMVault but we are still going to pull the value directly instead of hard coding
+                params = (
+                    target_variable_id,   # mapped VariableID
+                    source_documentID,    # should be 1 but we are going to pull it anyway
+                    target_project_id,    # mapped ProjectID
+                    source_rev,           # RevisionNo
+                    source_configID,      # ConfigurationID
+                )
+
+                # Some projects didnt map into the Target DB these are usually things like '' or 'No Project'
+                # These projects wont have a target ID so we are going to ignore them
+                if not target_project_id or not target_variable_id:
+                    ignore_count += 1
+                else:
+                    if params in target_set:
+                        success_count += 1
+                    else:
+                        self.logger.warning(f"Failed to find match for {params} in Target")
+                        error_list.append(params)  # record which record is missing
+                        writer.writerow([params[0], params[1], params[2], params[3], params[4]])
+        
+        self.logger.info("=" * 50)
+        self.logger.info("$  Migration Validation Completed!")
+        self.logger.info("=" * 50)
+        self.logger.info(f"Gross Success rate: {(success_count / len(source_values)) * 100:.2f}%")
+        self.logger.info(f"Success rate w/o Ignored Files: {(success_count / (len(source_values) - ignore_count)) * 100:.2f}%")
+        self.logger.info(f"{success_count} of {len(source_values)} Rows were found")
+        self.logger.info(f"MISSING ROW COUNT:{len(error_list)} - See CSV output for details")
+        self.logger.info(f"We ignored a total of {ignore_count} rows. We couldn't map these to the TargetDB. Either bad Var or Proj ID")
+        self.logger.info("$" * 50)
+
+    def run(self):
+        """Execute the complete migration process."""
+        try:
+            self.logger.info("=" * 50)
+            self.logger.info("Starting Data Migration")
+            self.logger.info("=" * 50)
+
+            self.connect_databases()
+            self.build_project_mapping()
+            self.build_variable_mapping()
+            self.validate_mappings()
+            self.export_mappings_to_csv()
+            self.migrate_variable_values()
+            self.validate_migration()
+
+            self.logger.info("=" * 50)
+            self.logger.info("Migration completed successfully!")
+            self.logger.info("=" * 50)
+
+        except Exception as e:
+            self.logger.error(f"Migration failed: {e}", exc_info=True)
+            raise
+
+        finally:
+            if self.source_conn:
+                self.source_conn.close()
+            if self.target_conn:
+                self.target_conn.close()
+
+
+if __name__ == "__main__":
+    migrator = DataMigrator('config.json')
+    migrator.run()