# pdm/migrate_filedata.py

"""
File Data Migration Script

Migrates VariableValue data for files (not projects) between SQL Server databases.
Maps DocumentIDs by matching each document's full vault path (folder path + Filename) between the two databases.

Key differences from migrate.py:
- All files have ProjectID=2 in both databases
- Maps DocumentID instead of ProjectID
- Uses the full vault path (Projects.Path + Documents.Filename) for mapping
- Filters for file-based records (DocumentID != 1)
"""

import json
import logging
import csv
import os
import glob
from datetime import datetime
from db_utils import DatabaseConnection


class FileDataMigration:
    """Handles migration of file-based VariableValue data between databases."""

    def __init__(self, config_file='config.json'):
        """Load the configuration, set up logging and open both database connections.

        Args:
            config_file: Path to the JSON configuration file. Its ``source_db``
                and ``target_db`` sections are handed to ``DatabaseConnection``
                and must each contain a ``database`` entry.
        """
        with open(config_file, 'r') as config_handle:
            self.config = json.load(config_handle)

        # The database names tie progress files to this source/target pair.
        self.source_db_name = self.config['source_db']['database']
        self.target_db_name = self.config['target_db']['database']

        # One timestamp per run; it is reused in the log, CSV and progress file names.
        self.timestamp = f'{datetime.now():%Y%m%d_%H%M%S}'

        # Log to a per-run file as well as to the console.
        handlers = [
            logging.FileHandler('filedata_migration_' + self.timestamp + '.log'),
            logging.StreamHandler(),
        ]
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=handlers,
        )
        self.logger = logging.getLogger(__name__)

        # Source-ID -> target-ID lookup tables, filled by the build_*_mapping methods.
        self.variable_map = {}
        self.document_map = {}
        self.configuration_map = {}
        self.state_map = {}

        # Open the source connection first, then the target connection.
        connect = DatabaseConnection
        self.source_conn = connect(self.config['source_db'])
        self.target_conn = connect(self.config['target_db'])

        self.logger.info("Database connections established.")

    def build_variable_mapping(self):
        """Build mapping of source VariableID to target VariableID based on variable names."""
        # Pulls all non-deleted, non-system variables from both the source and target databases,
        # then matches them by name. This produces a dict that translates source VariableIDs to
        # their corresponding target VariableIDs so migrated records point to the correct variable
        # in the new vault. Any source variables without a name match in the target are logged as unmapped.
        self.logger.info("Building variable ID mapping...")

        # Fetch variables from source (exclude deleted and system variables)
        source_variables = self.source_conn.execute_query(
            "SELECT VariableID, VariableName FROM Variable WHERE IsDeleted = 0 OR IsDeleted IS NULL"
        )

        # Fetch variables from target (exclude deleted and system variables)
        target_variables = self.target_conn.execute_query(
            "SELECT VariableID, VariableName FROM Variable WHERE IsDeleted = 0 OR IsDeleted IS NULL"
        )

        # Filter out system variables (names in curly brackets)
        source_user_vars = [v for v in source_variables if not (v['VariableName'].startswith('{') and v['VariableName'].endswith('}'))]
        target_user_vars = [v for v in target_variables if not (v['VariableName'].startswith('{') and v['VariableName'].endswith('}'))]

        self.logger.info(f"Found {len(source_user_vars)} user variables in source database")
        self.logger.info(f"Found {len(target_user_vars)} user variables in target database")

        # Create name-to-ID mapping for target
        target_var_map = {v['VariableName']: v['VariableID'] for v in target_user_vars}

        # Build source-to-target mapping
        mapped_count = 0
        unmapped_count = 0

        for source_var in source_user_vars:
            source_id = source_var['VariableID']
            var_name = source_var['VariableName']

            if var_name in target_var_map:
                target_id = target_var_map[var_name]
                self.variable_map[source_id] = target_id
                mapped_count += 1
            else:
                self.logger.warning(f"Variable '{var_name}' (ID: {source_id}) not found in target database")
                unmapped_count += 1

        self.logger.info(f"Variable mapping complete. Mapped {mapped_count} variables.")
        if unmapped_count > 0:
            self.logger.warning(f"Unmapped variables: {unmapped_count}")

    def transform_source_path(self, source_path):
        """Translate a source vault path into the path expected in the target vault.

        The target root folder (``path_mapping.target_root_folder`` in the
        configuration, default ``'Citadel'``) is prepended to the source path.
        Leading backslashes of the source path are dropped first; a trailing
        backslash on the source path is carried over unchanged.

        Args:
            source_path: Folder path from the source database, e.g. ``\\Projects\\A\\``.

        Returns:
            The transformed path (always starting with a single backslash), or
            ``None`` when ``source_path`` is empty or ``None``.
        """
        if not source_path:
            return None

        target_root = self.config.get('path_mapping', {}).get('target_root_folder', 'Citadel')

        # Accept roots written as 'Citadel', '\Citadel' or '\Citadel\' alike, so the
        # result never contains doubled separators. A null/empty root means "no prefix".
        root = (target_root or '').strip('\\')
        prefix = f'\\{root}\\' if root else '\\'

        # The result ends with whatever the source path ended with, so a trailing
        # backslash needs no separate handling.
        return prefix + source_path.lstrip('\\')

    def build_document_mapping(self):
        """Build mapping of source DocumentID to target DocumentID based on full vault path."""
        self.logger.info("Building document ID mapping using full vault path (folder path + filename)...")

        # Fetch documents with full paths from source
        # Join Documents -> DocumentsInProjects -> Projects to get folder path
        source_documents = self.source_conn.execute_query("""
            SELECT
                d.DocumentID,
                d.Filename,
                p.Path AS FolderPath,
                p.Path + d.Filename AS FullVaultPath
            FROM Documents d
            INNER JOIN DocumentsInProjects dp ON d.DocumentID = dp.DocumentID
            INNER JOIN Projects p ON dp.ProjectID = p.ProjectID
        """)

        # Fetch documents with full paths from target
        # Filter out deleted documents (Deleted=0) to avoid mapping to stale records
        # when a document has been added and deleted multiple times
        target_documents = self.target_conn.execute_query("""
            SELECT
                d.DocumentID,
                d.Filename,
                p.Path AS FolderPath,
                p.Path + d.Filename AS FullVaultPath
            FROM Documents d
            INNER JOIN DocumentsInProjects dp ON d.DocumentID = dp.DocumentID
            INNER JOIN Projects p ON dp.ProjectID = p.ProjectID
            WHERE d.Deleted = 0
        """)

        self.logger.info(f"Found {len(source_documents)} documents in source database")
        self.logger.info(f"Found {len(target_documents)} documents in target database")

        # Create full-path-to-ID mapping for target
        # Use case-insensitive comparison
        case_sensitive = self.config.get('path_mapping', {}).get('case_sensitive', False)

        target_doc_map = {}
        duplicate_count = 0
        for doc in target_documents:
            full_path = doc['FullVaultPath']
            if full_path:
                # Normalize path for case-insensitive matching if needed
                key = full_path if case_sensitive else full_path.lower()
                if key in target_doc_map:
                    duplicate_count += 1
                    # Keep the last occurrence
                target_doc_map[key] = doc

        self.logger.info(f"Built target document index with {len(target_doc_map)} unique paths")
        if duplicate_count > 0:
            self.logger.warning(f"Found {duplicate_count} duplicate paths in target (kept last occurrence)")

        # Build source-to-target mapping
        mapped_count = 0
        unmapped_count = 0
        null_path_count = 0

        for source_doc in source_documents:
            source_id = source_doc['DocumentID']
            filename = source_doc['Filename']
            folder_path = source_doc['FolderPath']
            source_full_path = source_doc['FullVaultPath']

            # Skip documents with no path
            if not source_full_path or not folder_path:
                self.logger.debug(f"Document '{filename}' (ID: {source_id}) has no path - skipping")
                null_path_count += 1
                continue

            # Transform source folder path to expected target folder path
            target_folder_path = self.transform_source_path(folder_path)

            if not target_folder_path:
                self.logger.debug(f"Document '{filename}' (ID: {source_id}) - path transformation failed")
                unmapped_count += 1
                continue

            # Construct expected target full path
            # Folder path already ends with '\', so just concatenate filename
            target_full_path = target_folder_path + filename

            # Look up in target using case-insensitive comparison
            key = target_full_path if case_sensitive else target_full_path.lower()

            if key in target_doc_map:
                target_doc = target_doc_map[key]
                target_id = target_doc['DocumentID']
                self.document_map[source_id] = target_id
                mapped_count += 1
            else:
                self.logger.debug(f"Document '{filename}' (ID: {source_id}) - Target path not found")
                self.logger.debug(f"  Source path: [{source_full_path}]")
                self.logger.debug(f"  Transformed to: [{target_full_path}]")
                unmapped_count += 1

        self.logger.info(f"Document mapping complete:")
        self.logger.info(f"  - Successfully mapped: {mapped_count} documents")
        self.logger.info(f"  - Unmapped (path not found): {unmapped_count} documents")
        self.logger.info(f"  - Skipped (null path): {null_path_count} documents")
        self.logger.info(f"  - Total in document_map: {len(self.document_map)} documents")

    def build_configuration_mapping(self):
        """Build mapping of source ConfigurationID to target ConfigurationID based on ConfigurationName."""
        """ NOTE: find the manual overrides by searching for all the instances where ConfigurationName has dupe in the DB"""
        self.logger.info("Building configuration ID mapping...")

        # Fetch configurations from source
        source_configs = self.source_conn.execute_query(
            "SELECT ConfigurationID, ConfigurationName FROM DocumentConfiguration"
        )

        # Fetch configurations from target
        target_configs = self.target_conn.execute_query(
            "SELECT ConfigurationID, ConfigurationName FROM DocumentConfiguration"
        )

        self.logger.info(f"Found {len(source_configs)} configurations in source database")
        self.logger.info(f"Found {len(target_configs)} configurations in target database")

        # Create ConfigurationName-to-ID mapping for target
        target_config_map = {}
        target_duplicates = {}

        for config in target_configs:
            config_name = config['ConfigurationName']
            config_id = config['ConfigurationID']

            if config_name in target_config_map:
                # Track duplicates
                if config_name not in target_duplicates:
                    target_duplicates[config_name] = [target_config_map[config_name]]
                target_duplicates[config_name].append(config_id)
            else:
                target_config_map[config_name] = config_id

        if target_duplicates:
            self.logger.warning(f"Found {len(target_duplicates)} duplicate ConfigurationNames in target:")
            for name, ids in target_duplicates.items():
                self.logger.warning(f"  '{name}': IDs {ids}")

        # Load manual overrides from config file
        manual_overrides = self.config.get('configuration_mapping_overrides', {})
        if manual_overrides:
            self.logger.warning("=" * 70)
            self.logger.warning("MANUAL CONFIGURATION MAPPING OVERRIDES DETECTED!")
            self.logger.warning(f"Found {len(manual_overrides)} manual configuration mapping overrides in config.json")
            self.logger.warning("=" * 70)
            self.logger.warning("Please verify these mappings are correct before proceeding:")

            # Convert string keys to int (JSON keys are always strings)
            manual_overrides = {int(k): int(v) for k, v in manual_overrides.items()}

            # Display the manual overrides
            for source_id, target_id in sorted(manual_overrides.items()):
                self.logger.warning(f"  Source ConfigurationID {source_id} -> Target ConfigurationID {target_id}")

            self.logger.warning("=" * 70)

            # Prompt user for confirmation
            try:
                response = input("\nHave you verified these configuration mappings are correct? (yes/no): ").strip().lower()
                if response != 'yes':
                    self.logger.error("Migration cancelled by user - please verify configuration mappings in config.json")
                    raise ValueError("User cancelled migration - configuration mappings not verified")
            except EOFError:
                # Non-interactive mode - log warning but continue
                self.logger.warning("Running in non-interactive mode - cannot prompt for confirmation")
                self.logger.warning("PROCEEDING WITH MANUAL CONFIGURATION OVERRIDES - ENSURE THESE ARE CORRECT!")

            self.logger.info(f"Proceeding with {len(manual_overrides)} manual configuration mapping overrides")

        # Build source-to-target ID mapping
        mapped_count = 0
        unmapped_count = 0
        override_count = 0

        for source_config in source_configs:
            source_id = source_config['ConfigurationID']
            config_name = source_config['ConfigurationName']

            # Check if there's a manual override first
            if source_id in manual_overrides:
                target_id = manual_overrides[source_id]
                self.configuration_map[source_id] = target_id
                self.logger.debug(f"Manual override: ConfigurationID {source_id} -> {target_id} ('{config_name}')")
                override_count += 1
                mapped_count += 1
            elif config_name in target_config_map:
                target_id = target_config_map[config_name]
                self.configuration_map[source_id] = target_id
                self.logger.debug(f"Mapped Configuration '{config_name}': {source_id} -> {target_id}")
                mapped_count += 1
            else:
                self.logger.warning(f"Configuration '{config_name}' (ID: {source_id}) not found in target database")
                unmapped_count += 1

        self.logger.info(f"Configuration mapping complete:")
        self.logger.info(f"  - Successfully mapped: {mapped_count} configurations")
        self.logger.info(f"  - Manual overrides applied: {override_count} configurations")
        self.logger.info(f"  - Unmapped: {unmapped_count} configurations")
        self.logger.info(f"  - Total in configuration_map: {len(self.configuration_map)} configurations")

    def build_state_mapping(self):
        """Build mapping of source CurrentStatusID to target CurrentStatusID based on explicit config.json mappings."""
        self.logger.info("Building state (CurrentStatusID) mapping...")

        # Load manual state mappings from config file
        # Unlike configuration mapping, state mapping ONLY uses explicit mappings from config
        # If no mapping exists, the target DB value is left unchanged
        manual_mappings = self.config.get('state_mapping_overrides', {})

        if not manual_mappings:
            self.logger.info("No state mapping overrides found in config.json")
            self.logger.info("CurrentStatusID values will remain unchanged in target database")
            return

        self.logger.warning("=" * 70)
        self.logger.warning("MANUAL STATE (CurrentStatusID) MAPPING OVERRIDES DETECTED!")
        self.logger.warning(f"Found {len(manual_mappings)} manual state mapping overrides in config.json")
        self.logger.warning("=" * 70)
        self.logger.warning("Please verify these mappings are correct before proceeding:")

        # Convert string keys to int (JSON keys are always strings)
        manual_mappings = {int(k): int(v) for k, v in manual_mappings.items()}

        # Display the manual mappings
        for source_id, target_id in sorted(manual_mappings.items()):
            self.logger.warning(f"  Source CurrentStatusID {source_id} -> Target CurrentStatusID {target_id}")

        self.logger.warning("=" * 70)

        # Prompt user for confirmation
        try:
            response = input("\nHave you verified these state mappings are correct? (yes/no): ").strip().lower()
            if response != 'yes':
                self.logger.error("Migration cancelled by user - please verify state mappings in config.json")
                raise ValueError("User cancelled migration - state mappings not verified")
        except EOFError:
            # Non-interactive mode - log warning but continue
            self.logger.warning("Running in non-interactive mode - cannot prompt for confirmation")
            self.logger.warning("PROCEEDING WITH MANUAL STATE OVERRIDES - ENSURE THESE ARE CORRECT!")

        # Store the mappings
        self.state_map = manual_mappings

        self.logger.info(f"State mapping complete:")
        self.logger.info(f"  - Total explicit mappings loaded: {len(self.state_map)} state mappings")
        self.logger.info("  - Any CurrentStatusID not in this mapping will remain unchanged in target")

    def export_mappings_to_csv(self):
        """Export variable and document mappings to CSV files for verification."""
        self.logger.info("Exporting mappings to CSV files...")

        # Export variable mappings
        var_csv_filename = f'mapping_variables_filedata_{self.timestamp}.csv'
        with open(var_csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['VariableName', 'SourceID', 'TargetID'])

            # Get variable names for the mapping
            source_vars = self.source_conn.execute_query(
                "SELECT VariableID, VariableName FROM Variable"
            )
            source_var_names = {v['VariableID']: v['VariableName'] for v in source_vars}

            for source_id, target_id in self.variable_map.items():
                var_name = source_var_names.get(source_id, 'Unknown')
                writer.writerow([var_name, source_id, target_id])

        self.logger.info(f"Variable mappings exported to: {var_csv_filename} ({len(self.variable_map)} mappings)")

        # Export document mappings
        doc_csv_filename = f'mapping_documents_filedata_{self.timestamp}.csv'
        with open(doc_csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['Filename', 'SourceDocID', 'TargetDocID', 'SourcePath', 'TargetPath'])

            # Get filenames and full paths for the mapping
            source_docs = self.source_conn.execute_query("""
                SELECT
                    d.DocumentID,
                    d.Filename,
                    p.Path + d.Filename AS FullVaultPath
                FROM Documents d
                INNER JOIN DocumentsInProjects dp ON d.DocumentID = dp.DocumentID
                INNER JOIN Projects p ON dp.ProjectID = p.ProjectID
            """)
            source_doc_info = {d['DocumentID']: {'Filename': d['Filename'], 'FullVaultPath': d['FullVaultPath']} for d in source_docs}

            target_docs = self.target_conn.execute_query("""
                SELECT
                    d.DocumentID,
                    d.Filename,
                    p.Path + d.Filename AS FullVaultPath
                FROM Documents d
                INNER JOIN DocumentsInProjects dp ON d.DocumentID = dp.DocumentID
                INNER JOIN Projects p ON dp.ProjectID = p.ProjectID
            """)
            target_doc_info = {d['DocumentID']: {'Filename': d['Filename'], 'FullVaultPath': d['FullVaultPath']} for d in target_docs}

            for source_id, target_id in self.document_map.items():
                source_info = source_doc_info.get(source_id, {})
                target_info = target_doc_info.get(target_id, {})
                filename = source_info.get('Filename', 'Unknown')
                source_path = source_info.get('FullVaultPath', 'Unknown')
                target_path = target_info.get('FullVaultPath', 'Unknown')
                writer.writerow([filename, source_id, target_id, source_path, target_path])

        self.logger.info(f"Document mappings exported to: {doc_csv_filename} ({len(self.document_map)} mappings)")

    def save_migration_progress(self, batch_num, total_batches, stats):
        """Save migration progress to resume on failure."""
        progress_file = f"filedata_migration_progress_{self.source_db_name}_to_{self.target_db_name}_{self.timestamp}.json"

        progress_data = {
            'timestamp': datetime.now().isoformat(),
            'migration_timestamp': self.timestamp,
            'source_database': self.source_db_name,
            'target_database': self.target_db_name,
            'last_completed_batch': batch_num,
            'total_batches': total_batches,
            'records_inserted': stats['inserted'],
            'records_updated': stats['updated'],
            'records_errors': stats['errors']
        }

        with open(progress_file, 'w') as f:
            json.dump(progress_data, f, indent=2)

        self.logger.info(f"Progress saved: batch {batch_num}/{total_batches}")

    def load_migration_progress(self):
        """Load previous migration progress if exists for current database pair."""
        progress_pattern = f"filedata_migration_progress_{self.source_db_name}_to_{self.target_db_name}_*.json"
        progress_files = glob.glob(progress_pattern)

        if not progress_files:
            self.logger.info("No previous file data migration progress found for this database pair")
            return None

        latest_progress = max(progress_files, key=os.path.getmtime)

        with open(latest_progress, 'r') as f:
            progress = json.load(f)

        if (progress.get('source_database') != self.source_db_name or
            progress.get('target_database') != self.target_db_name):
            self.logger.warning(f"Found progress file but database names don't match. Ignoring.")
            return None

        self.logger.info(f"Found previous file data migration progress: {latest_progress}")
        self.logger.info(f"  Source DB: {progress['source_database']} -> Target DB: {progress['target_database']}")
        self.logger.info(f"  Last completed batch: {progress['last_completed_batch']}/{progress['total_batches']}")
        self.logger.info(f"  Records inserted: {progress['records_inserted']}")
        self.logger.info(f"  Records updated: {progress['records_updated']}")

        return progress

    def cleanup_progress_file(self):
        """Remove progress file after successful completion for this database pair."""
        progress_pattern = f"filedata_migration_progress_{self.source_db_name}_to_{self.target_db_name}_*.json"
        progress_files = glob.glob(progress_pattern)

        for pf in progress_files:
            os.remove(pf)
            self.logger.info(f"Removed progress file: {pf}")

    def preview_migration(self, num_records=20):
        """
        Preview what the first batch of VariableValue migrations would look like.
        Shows the actual SQL parameters that will be inserted/updated.
        """
        self.logger.info("="*70)
        self.logger.info(f"MIGRATION PREVIEW - First {num_records} Records")
        self.logger.info("="*70)
        self.logger.info("Showing actual SQL MERGE parameters that will be executed")
        self.logger.info("="*70)

        # Fetch source VariableValue records (same filter as actual migration)
        source_values = self.source_conn.execute_query(
            f"""SELECT TOP {num_records} * FROM VariableValue
               WHERE ProjectID = 2 AND DocumentID != 1
               ORDER BY DocumentID, VariableID, RevisionNo"""
        )

        if not source_values:
            self.logger.info("No source records found to preview.")
            return

        self.logger.info(f"\nFound {len(source_values)} source records to preview\n")

        # Preview each record
        mapped_count = 0
        skipped_count = 0

        for i, record in enumerate(source_values, 1):
            # Extract source IDs
            source_variable_id = record.get('VariableID')
            source_document_id = record.get('DocumentID')

            # Map to target IDs
            target_variable_id = self.variable_map.get(source_variable_id)
            target_document_id = self.document_map.get(source_document_id)

            self.logger.info(f"Record #{i}:")
            self.logger.info(f"  SOURCE VariableValue Row:")
            self.logger.info(f"    VariableID    : {record.get('VariableID')}")
            self.logger.info(f"    DocumentID    : {record.get('DocumentID')}")
            self.logger.info(f"    ProjectID     : {record.get('ProjectID')}")
            self.logger.info(f"    RevisionNo    : {record.get('RevisionNo')}")
            self.logger.info(f"    ConfigurationID: {record.get('ConfigurationID')}")
            self.logger.info(f"    ValueText     : {record.get('ValueText')}")
            self.logger.info(f"    ValueInt      : {record.get('ValueInt')}")
            self.logger.info(f"    ValueFloat    : {record.get('ValueFloat')}")
            self.logger.info(f"    ValueDate     : {record.get('ValueDate')}")
            self.logger.info(f"    ValueCache    : {record.get('ValueCache')}")
            self.logger.info(f"    IsLongText    : {record.get('IsLongText')}")

            # Check if mapping exists
            if target_variable_id is None or target_document_id is None:
                self.logger.warning(f"  TARGET: WILL BE SKIPPED (unmapped)")
                if target_variable_id is None:
                    self.logger.warning(f"    - Source VariableID {source_variable_id} not mapped to target")
                if target_document_id is None:
                    self.logger.warning(f"    - Source DocumentID {source_document_id} not mapped to target")
                skipped_count += 1
            else:
                # Prepare params exactly as migration would (same logic as migrate_file_variable_values)
                params = (
                    target_variable_id,              # VariableID
                    target_document_id,              # DocumentID
                    2,                               # ProjectID (always 2 for files)
                    record.get('RevisionNo'),        # RevisionNo
                    record.get('ConfigurationID'),   # ConfigurationID
                    record.get('ValueText'),         # ValueText
                    record.get('ValueInt'),          # ValueInt
                    record.get('ValueFloat'),        # ValueFloat
                    record.get('ValueDate'),         # ValueDate
                    record.get('ValueCache'),        # ValueCache
                    record.get('IsLongText')         # IsLongText
                )

                self.logger.info(f"  TARGET MERGE Parameters (will be UPSERTED):")
                self.logger.info(f"    VariableID    : {params[0]} (mapped from source {source_variable_id})")
                self.logger.info(f"    DocumentID    : {params[1]} (mapped from source {source_document_id})")
                self.logger.info(f"    ProjectID     : {params[2]} (always 2 for files)")
                self.logger.info(f"    RevisionNo    : {params[3]}")
                self.logger.info(f"    ConfigurationID: {params[4]}")
                self.logger.info(f"    ValueText     : {params[5]}")
                self.logger.info(f"    ValueInt      : {params[6]}")
                self.logger.info(f"    ValueFloat    : {params[7]}")
                self.logger.info(f"    ValueDate     : {params[8]}")
                self.logger.info(f"    ValueCache    : {params[9]}")
                self.logger.info(f"    IsLongText    : {params[10]}")
                mapped_count += 1

            self.logger.info("")

        self.logger.info("="*70)
        self.logger.info(f"PREVIEW SUMMARY")
        self.logger.info("="*70)
        self.logger.info(f"Records that WILL be migrated: {mapped_count}")
        self.logger.info(f"Records that will be SKIPPED: {skipped_count}")
        self.logger.info("")

    def migrate_file_variable_values(self):
        """
        Migrate VariableValue records for files (ProjectID=2, DocumentID != 1).
        Uses UPSERT mode with periodic commits and resume capability.
        """
        self.logger.info("Starting file-based VariableValue migration (UPSERT mode)...")

        # Get batch settings
        batch_size = self.config.get('migration', {}).get('batch_size', 500)
        commit_interval = self.config.get('migration', {}).get('commit_interval', 10)

        self.logger.info(f"Migration settings: batch_size={batch_size}, mode=UPSERT (insert new, update existing)")

        # Fetch ONLY the latest revision for each VariableID+DocumentID+ConfigurationID combination
        # Filter: ProjectID=2 (files) and DocumentID != 1 (not project-level variables)
        # Uses window function to get only the highest RevisionNo for each variable per document per configuration
        source_values = self.source_conn.execute_query(
            """SELECT * FROM (
                   SELECT *,
                          ROW_NUMBER() OVER (PARTITION BY DocumentID, VariableID, ConfigurationID
                                            ORDER BY RevisionNo DESC) as rn
                   FROM VariableValue
                   WHERE ProjectID = 2 AND DocumentID != 1
               ) ranked
               WHERE rn = 1
               ORDER BY DocumentID, VariableID, ConfigurationID"""
        )

        self.logger.info(f"Found {len(source_values)} latest-revision VariableValue records in source (ProjectID=2, DocumentID!=1).")

        # Prepare MERGE query for UPSERT operation
        # Match on all primary key columns: VariableID + DocumentID + ProjectID + RevisionNo + ConfigurationID
        merge_query = """
            MERGE INTO VariableValue AS target
            USING (SELECT ? AS VariableID, ? AS DocumentID, ? AS ProjectID, ? AS RevisionNo,
                          ? AS ConfigurationID, ? AS ValueText, ? AS ValueInt, ? AS ValueFloat,
                          ? AS ValueDate, ? AS ValueCache, ? AS IsLongText) AS source
            ON (target.VariableID = source.VariableID
                AND target.DocumentID = source.DocumentID
                AND target.ProjectID = source.ProjectID
                AND target.RevisionNo = source.RevisionNo
                AND target.ConfigurationID = source.ConfigurationID)
            WHEN MATCHED THEN
                UPDATE SET
                    ValueText = source.ValueText,
                    ValueInt = source.ValueInt,
                    ValueFloat = source.ValueFloat,
                    ValueDate = source.ValueDate,
                    ValueCache = source.ValueCache,
                    IsLongText = source.IsLongText
            WHEN NOT MATCHED THEN
                INSERT (VariableID, DocumentID, ProjectID, RevisionNo, ConfigurationID, ValueText, ValueInt, ValueFloat, ValueDate, ValueCache, IsLongText)
                VALUES (source.VariableID, source.DocumentID, source.ProjectID, source.RevisionNo, source.ConfigurationID, source.ValueText, source.ValueInt, source.ValueFloat, source.ValueDate, source.ValueCache, source.IsLongText);
        """

        # Collect records to upsert
        params_list = []
        skipped_unmapped = 0

        for record in source_values:
            # Extract source IDs
            source_variable_id = record.get('VariableID')
            source_document_id = record.get('DocumentID')

            # Map to target IDs
            target_variable_id = self.variable_map.get(source_variable_id)
            target_document_id = self.document_map.get(source_document_id)

            # Skip if mapping not found
            if target_variable_id is None:
                self.logger.debug(f"Skipping record: VariableID {source_variable_id} not mapped")
                skipped_unmapped += 1
                continue

            if target_document_id is None:
                self.logger.debug(f"Skipping record: DocumentID {source_document_id} not mapped")
                skipped_unmapped += 1
                continue

            # Extract other fields
            source_rev = record.get('RevisionNo')  # Read but not used - we always insert as Rev 1
            source_configID = record.get('ConfigurationID')
            source_valueText = record.get('ValueText')
            source_valueInt = record.get('ValueInt')
            source_valueFloat = record.get('ValueFloat')
            source_valueDate = record.get('ValueDate')
            source_valueCache = record.get('ValueCache')
            source_islongtext = record.get('IsLongText')

            # Map ConfigurationID to target
            target_config_id = self.configuration_map.get(source_configID)

            # Skip if configuration mapping not found
            if target_config_id is None:
                self.logger.debug(f"Skipping record: ConfigurationID {source_configID} not mapped")
                skipped_unmapped += 1
                continue

            # Prepare params for MERGE statement
            # ProjectID is always 2 for files in target database
            # RevisionNo is always 1 (we only migrate latest revision as new Rev 1)
            params = (
                target_variable_id,   # VariableID
                target_document_id,   # DocumentID
                2,                    # ProjectID (always 2 for files)
                1,                    # RevisionNo (always 1 for migrated data)
                target_config_id,     # ConfigurationID (mapped)
                source_valueText,     # ValueText
                source_valueInt,      # ValueInt
                source_valueFloat,    # ValueFloat
                source_valueDate,     # ValueDate
                source_valueCache,    # ValueCache
                source_islongtext     # IsLongText
            )

            params_list.append(params)

        self.logger.info(f"Prepared {len(params_list)} records for UPSERT")
        self.logger.info(f"Skipped {skipped_unmapped} records (unmapped IDs)")

        # Check for previous progress
        previous_progress = self.load_migration_progress()

        # Calculate batches
        total_batches = (len(params_list) + batch_size - 1) // batch_size

        # Initialize statistics
        total_stats = {'inserted': 0, 'updated': 0, 'errors': 0}
        start_batch = 0

        # Handle resume
        if previous_progress:
            self.logger.info(f"*** Previous file data migration found ***")
            self.logger.info(f"Last completed batch: {previous_progress['last_completed_batch']}/{previous_progress['total_batches']}")
            self.logger.info(f"Records inserted: {previous_progress['records_inserted']}")
            self.logger.info(f"Records updated: {previous_progress['records_updated']}")

            try:
                print(f"\n*** Previous file data migration found ***")
                print(f"Last completed batch: {previous_progress['last_completed_batch']}/{previous_progress['total_batches']}")
                print(f"Records inserted: {previous_progress['records_inserted']}")
                print(f"Records updated: {previous_progress['records_updated']}")
                response = input(f"Resume from batch {previous_progress['last_completed_batch'] + 1}? (y/n): ").strip().lower()
            except EOFError:
                response = 'y'
                self.logger.info("Running in non-interactive mode - automatically resuming")

            if response == 'y':
                start_batch = previous_progress['last_completed_batch']
                total_stats['inserted'] = previous_progress['records_inserted']
                total_stats['updated'] = previous_progress['records_updated']
                total_stats['errors'] = previous_progress['records_errors']
                self.logger.info(f"Resuming from batch {start_batch + 1}")
            else:
                self.logger.info("Starting fresh migration (previous progress will be overwritten)")
                self.cleanup_progress_file()

        self.logger.info(f"Commit interval: every {commit_interval} batches")

        # Process in batches
        for batch_num in range(start_batch, total_batches):
            batch_start = batch_num * batch_size
            batch_end = min(batch_start + batch_size, len(params_list))
            batch_params = params_list[batch_start:batch_end]

            self.logger.info(f"Processing batch {batch_num + 1}/{total_batches} ({len(batch_params)} records)...")

            # Execute MERGE for this batch
            for params in batch_params:
                try:
                    cursor = self.target_conn.connection.cursor()
                    cursor.execute(merge_query, params)

                    # Check if record was inserted or updated
                    # (This is approximate - SQL Server doesn't easily report MERGE action)
                    if cursor.rowcount > 0:
                        total_stats['inserted'] += 1
                    else:
                        total_stats['updated'] += 1

                    cursor.close()

                except Exception as e:
                    total_stats['errors'] += 1
                    self.logger.error(f"Error upserting record: {e}")

            self.logger.info(f"Batch {batch_num + 1} complete: inserted={total_stats['inserted']}, updated={total_stats['updated']}, errors={total_stats['errors']}")

            # Commit every N batches to keep connection alive and save progress
            if (batch_num + 1) % commit_interval == 0:
                self.target_conn.commit()
                self.save_migration_progress(batch_num + 1, total_batches, total_stats)
                self.logger.info(f"[COMMIT] Transaction committed at batch {batch_num + 1} (every {commit_interval} batches)")

        # Final commit
        self.target_conn.commit()
        self.logger.info(f"[SUCCESS] Final transaction committed")

        # Clean up progress file on success
        self.cleanup_progress_file()

        self.logger.info("File-based VariableValue migration completed successfully!")
        self.logger.info(f"Total records inserted: {total_stats['inserted']}")
        self.logger.info(f"Total records updated: {total_stats['updated']}")
        self.logger.info(f"Total errors: {total_stats['errors']}")

    def validate_migration(self):
        # After the migration runs we will run this script to validate that everything was inserted correctly into the Target DB
        self.logger.info("Running Validation on the Target to ensure we completed migration successfully")

        # the primary key columns are VariableID, DocumentID, ProjectID, RevisionNo, ConfigurationID
        # IMPORTANT: We only migrated the LATEST revision from source, so only validate those
        source_values = self.source_conn.execute_query(
            """SELECT * FROM (
                   SELECT VariableID, DocumentID, ProjectID, RevisionNo, ConfigurationID,
                          ROW_NUMBER() OVER (PARTITION BY DocumentID, VariableID, ConfigurationID
                                            ORDER BY RevisionNo DESC) as rn
                   FROM VariableValue
                   WHERE ProjectID = 2 AND DocumentID != 1
               ) ranked
               WHERE rn = 1
               ORDER BY DocumentID, VariableID, ConfigurationID"""
        )

        # Target should have all records with RevisionNo = 1
        target_values = self.target_conn.execute_query(
            """SELECT VariableID, DocumentID, ProjectID, RevisionNo, ConfigurationID FROM VariableValue
               WHERE ProjectID = 2 AND DocumentID != 1 AND RevisionNo = 1
               ORDER BY DocumentID, VariableID, ConfigurationID"""
        )

        # --------------------------------------
        # Convert target_values to a set of tuples for fast lookup
        # ---------------------------------------
        self.logger.info("Building target record set for comparison...")
        target_set = set()  # this is every row of the VariableValue table in the Target DB
        for record in target_values:
            key = (
                record['VariableID'],
                record['DocumentID'],
                record['ProjectID'],
                record['RevisionNo'],
                record['ConfigurationID']
            )
            target_set.add(key)

        error_list = []  # this is the container we are going to use to hold the errors we find
        success_count = 0  # we will just tally the records we find
        ignore_count = 0  # counter for rows that we didnt map because we couldnt find a documentID or variableID in the Target DB

        # now we search the target for each row using the mapped values to make sure that it is in there, we log it if we can't find it
        # --------------------------------------
        # Create a CSV to log all the rows we think we are missing and begin the scan
        # ---------------------------------------
        doc_csv_filename = f'validation_missing_filedata_{self.timestamp}.csv'

        with open(doc_csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['Target_VariableID', 'Target_DocumentID', 'ProjectID', 'RevisionNo', 'ConfigurationID'])  # write header
            self.logger.info(f"Prepared {len(source_values)} records. Beginning validation with Target DB")

            for record in source_values:
                # Extract source IDs
                source_variable_id = record.get('VariableID')
                source_document_id = record.get('DocumentID')

                # Map to target IDs
                target_variable_id = self.variable_map.get(source_variable_id)
                target_document_id = self.document_map.get(source_document_id)

                # Extract other fields
                source_rev = record.get('RevisionNo')  # Read but not used - we always insert as Rev 1
                source_configID = record.get('ConfigurationID')
                source_projectID = record.get("ProjectID")  # even though all of these should be 2 we are still going to pull the actual value

                # Map ConfigurationID to target
                target_config_id = self.configuration_map.get(source_configID)

                # Prepare params for validation
                # ProjectID is always 2 for files in target database
                # RevisionNo is always 1 (we migrated latest revision as Rev 1)
                params = (
                    target_variable_id,   # mapped VariableID
                    target_document_id,   # mapped DocumentID
                    source_projectID,     # ProjectID (should be 2 for files)
                    1,                    # RevisionNo (always 1 in target)
                    target_config_id,     # ConfigurationID (mapped)
                )

                # Some documents, variables, or configurations didnt map into the Target DB
                # These wont have a target ID so we are going to ignore them
                if not target_document_id or not target_variable_id or not target_config_id:
                    ignore_count += 1
                else:
                    if params in target_set:
                        success_count += 1
                    else:
                        self.logger.warning(f"Failed to find match for {params} in Target")
                        error_list.append(params)  # record which record is missing
                        writer.writerow([params[0], params[1], params[2], params[3], params[4]])


        self.logger.info("=" * 50)
        self.logger.info("$  Migration Validation Completed!")
        self.logger.info("=" * 50)
        self.logger.info(f"Gross Success rate: {(success_count / len(source_values)) * 100:.2f}%")
        self.logger.info(f"Success rate w/o Ignored Files: {(success_count / (len(source_values) - ignore_count)) * 100:.2f}%")
        self.logger.info(f"{success_count} of {len(source_values)} Rows were found")
        self.logger.info(f"-" * 50)
        self.logger.info(f"MISSING ROW COUNT:{len(error_list)} - See CSV output for details")
        self.logger.info(f"We ignored a total of {ignore_count} rows. We couldn't map these to the TargetDB. Either bad Var, Doc, or Config ID")

    def migrate_documents_status(self):
        """
        Migrate CurrentStatusID from source Documents to target Documents.
        Only updates CurrentStatusID if a state mapping exists in config.json.
        If no mapping exists, the target DB value remains unchanged.
        """
        self.logger.info("Starting Documents CurrentStatusID migration...")

        # Check if there are any state mappings
        if not self.state_map:
            self.logger.info("No state mappings configured - skipping CurrentStatusID migration")
            return

        # Get batch settings - use dedicated document status batch size if configured
        batch_size = self.config.get('migration', {}).get('document_status_batch_size', 5000)
        commit_interval = self.config.get('migration', {}).get('commit_interval', 10)

        self.logger.info(f"Migration settings: batch_size={batch_size}, commit_interval={commit_interval}")
        self.logger.info(f"Using executemany() for fast batch updates")

        # Fetch CurrentStatusID from source Documents
        source_documents = self.source_conn.execute_query("""
            SELECT DocumentID, CurrentStatusID
            FROM Documents
            WHERE CurrentStatusID IS NOT NULL
        """)

        self.logger.info(f"Found {len(source_documents)} documents with CurrentStatusID in source database")

        # Prepare update parameters
        update_params = []
        skipped_no_mapping = 0
        skipped_unmapped_doc = 0

        for doc in source_documents:
            source_doc_id = doc['DocumentID']
            source_status_id = doc['CurrentStatusID']

            # Map source DocumentID to target DocumentID
            target_doc_id = self.document_map.get(source_doc_id)

            if target_doc_id is None:
                # Document not mapped to target - skip
                skipped_unmapped_doc += 1
                continue

            # Map source CurrentStatusID to target CurrentStatusID
            target_status_id = self.state_map.get(source_status_id)

            if target_status_id is None:
                # No state mapping exists - leave target DB value unchanged
                self.logger.debug(f"Document {source_doc_id}: No state mapping for CurrentStatusID={source_status_id} - skipping")
                skipped_no_mapping += 1
                continue

            # Add to update list
            update_params.append((target_status_id, target_doc_id))

        self.logger.info(f"Prepared {len(update_params)} documents for CurrentStatusID update")
        self.logger.info(f"Skipped {skipped_unmapped_doc} documents (not mapped to target)")
        self.logger.info(f"Skipped {skipped_no_mapping} documents (no state mapping configured)")

        if not update_params:
            self.logger.info("No documents to update - migration complete")
            return

        # Update query
        update_query = """
            UPDATE Documents
            SET CurrentStatusID = ?
            WHERE DocumentID = ?
        """

        # Calculate batches
        total_batches = (len(update_params) + batch_size - 1) // batch_size

        # Initialize statistics
        total_updated = 0
        total_errors = 0

        # Process in batches
        for batch_num in range(total_batches):
            batch_start = batch_num * batch_size
            batch_end = min(batch_start + batch_size, len(update_params))
            batch_params = update_params[batch_start:batch_end]

            self.logger.info(f"Processing batch {batch_num + 1}/{total_batches} ({len(batch_params)} documents)...")

            # Execute updates for this batch using executemany (much faster than individual executes)
            try:
                cursor = self.target_conn.connection.cursor()
                cursor.executemany(update_query, batch_params)

                # executemany returns total affected rows
                batch_updated = cursor.rowcount
                total_updated += batch_updated

                cursor.close()

                self.logger.info(f"Batch {batch_num + 1} complete: {batch_updated} documents updated in this batch")

            except Exception as e:
                total_errors += len(batch_params)
                self.logger.error(f"Error updating batch {batch_num + 1}: {e}")
                self.logger.error(f"Failed to update {len(batch_params)} documents in this batch")

            # Commit every N batches
            if (batch_num + 1) % commit_interval == 0:
                self.target_conn.commit()
                self.logger.info(f"[COMMIT] Transaction committed at batch {batch_num + 1} (every {commit_interval} batches)")

        # Final commit
        self.target_conn.commit()
        self.logger.info(f"[SUCCESS] Final transaction committed")

        self.logger.info("Documents CurrentStatusID migration completed successfully!")
        self.logger.info(f"Total documents updated: {total_updated}")
        self.logger.info(f"Total errors: {total_errors}")

    def run(self):
        """Execute the complete file data migration process."""
        try:
            self.logger.info("=" * 50)
            self.logger.info("Starting File Data Migration")
            self.logger.info("=" * 50)

            # Build mappings
            self.build_variable_mapping()
            self.build_document_mapping()
            self.build_configuration_mapping()
            self.build_state_mapping()

            # Export mappings for verification
            self.export_mappings_to_csv()

            # Preview first batch of migrations
            # self.preview_migration(num_records=20)

            # Perform migration
            self.migrate_file_variable_values()

            # Migrate Documents CurrentStatusID if state mappings are configured
            # TODO DONT TOUCH THIS UNTIL A NEW VAULT MIGRATION
            # TODO State Mapping via SQL does not work
            # self.migrate_documents_status()

            # Perform validation
            # self.validate_migration()

            self.logger.info("=" * 50)
            self.logger.info("File Data Migration Completed Successfully!")
            self.logger.info("=" * 50)

        except Exception as e:
            self.logger.error(f"Migration failed: {e}", exc_info=True)
            raise

        finally:
            # Close connections
            self.source_conn.close()
            self.target_conn.close()


def main():
    """Script entry point: create the migrator with its default config file and run it."""
    FileDataMigration().run()


# Start the migration only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()