/*
 *   Copyright (c) International Business Machines  Corp., 2001
 *
 *   This program is free software;  you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
 *   the GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program;  if not, write to the Free Software
 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Module: mdregmgr
 * File: raid5_discover.c
 *
 * Description: This file contains all functions related to the initial
 *              discovery of raid5 MD physical volumes and logical
 *              volumes.
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <plugin.h>

#define MY_PLUGIN raid5_plugin
#include "md.h"
#include "raid5_discover.h"
#include "raid5_mgr.h"


/*
 * Function: create_raid5_conf
 *
 * Populate the raid5 configuration structure of 'volume' (the structure
 * returned by mdvol_to_conf()) from the volume's master super block and
 * from the child objects that are present.
 *
 * Every present child is classified, through its descriptor in the master
 * super block, as faulty, active (operational) or spare, and is recorded in
 * conf->disks[] at the descriptor's raid_disk index.  The per-class counters
 * in the conf are only incremented here, never reset.
 * NOTE(review): this presumes the caller hands in a zeroed conf -- confirm
 * for the md_allocate_memory() path.
 * NOTE(review): raid_disk is taken straight from the super block and is not
 * range-checked against the size of conf->disks[] before the faulty and
 * spare branches write through it -- confirm it is validated upstream.
 *
 * If exactly one RAID member is not active, the volume is flagged
 * MD_DEGRADED and the user is told which member is missing.  If the missing
 * member cannot be identified, or more than one member is not active, the
 * volume is flagged MD_CORRUPT instead.
 *
 * Always returns 0.
 */
static int create_raid5_conf (md_volume_t * volume) {

	int i;
	raid5_conf_t * conf = mdvol_to_conf(volume);
	mdp_super_t * good_sb = volume->super_block;
	mdp_disk_t * sb_disk;
	disk_info_t * conf_disk;
	int disk_index;

	LOG_ENTRY;

	conf->mddev = volume;
	conf->failed_disk_index = -1;

	for (i = 0; i < MD_SB_DISKS; i++) {
		if (volume->child_object[i] != NULL) {
			/*
			 * This is important -- we are using the descriptor on
			 * the disk only to get a pointer to the descriptor on
			 * the main superblock, which might be more recent.
			 */
			sb_disk = good_sb->disks + i;
			disk_index = sb_disk->raid_disk;
			conf_disk = &conf->disks[disk_index];

			if (sb_disk->state & (1 << MD_DISK_FAULTY)) {
				LOG_WARNING("Disabled device %s (errors detected)\n", volume->child_object[i]->name);
				conf_disk->number = sb_disk->number;
				conf_disk->raid_disk = disk_index;
				conf_disk->dev = volume->child_object[i];

				conf_disk->operational = 0;
				conf_disk->write_only = 0;
				conf_disk->spare = 0;
				conf_disk->used_slot = 1;

				/*
				 * Save the failed disk index if the disk is
				 * part of the RAID and we don't already have
				 * a failed disk.
				 */
				if (disk_index < good_sb->raid_disks) {
					if (conf->failed_disk_index < 0) {
						conf->failed_disk_index = disk_index;
					}
				}
				conf->failed_disks++;
				continue;
			}

			if (sb_disk->state & (1 << MD_DISK_ACTIVE)) {
				/*
				 * MD_DISK_SYNC is a bit number, not a mask; it
				 * must be shifted like every other state test
				 * in this file.
				 */
				if (!(sb_disk->state & (1 << MD_DISK_SYNC))) {
					LOG_WARNING("Disabled device %s (not in sync)\n", volume->child_object[i]->name);
					continue;
				}
				/*
				 * Valid RAID indexes are 0 .. raid_disks - 1,
				 * so an index equal to raid_disks is already
				 * outside of the array.
				 */
				if (disk_index >= good_sb->raid_disks) {
					LOG_WARNING("Disabled device %s (inconsistent descriptor)\n", volume->child_object[i]->name);
					continue;
				}
				if (conf_disk->operational) {
					LOG_WARNING("Disabled device %s (device %d already operational)\n", volume->child_object[i]->name, disk_index);
					continue;
				}
				LOG_DEBUG("Device %s operational as raid disk %d.\n", volume->child_object[i]->name, disk_index);

				conf_disk->number = sb_disk->number;
				conf_disk->raid_disk = disk_index;
				conf_disk->dev = volume->child_object[i];
				conf_disk->operational = 1;
				conf_disk->used_slot = 1;

				conf->active_disks++;
			} else {
				/*
				 * Must be a spare disk.
				 */
				LOG_DEBUG("spare disk %s\n", volume->child_object[i]->name);
				conf_disk->number = sb_disk->number;
				conf_disk->raid_disk = disk_index;
				conf_disk->dev = volume->child_object[i];

				conf_disk->operational = 0;
				conf_disk->write_only = 0;
				conf_disk->spare = 1;
				conf_disk->used_slot = 1;

				/* Remember the first spare that was found. */
				if (conf->spare.used_slot == 0) {
					conf->spare = *conf_disk;
				}
				conf->spare_disks++;
			}
		}
	}

	conf->chunk_size   = good_sb->chunk_size;
	conf->level        = good_sb->level;
	conf->algorithm    = good_sb->layout;
	conf->raid_disks   = good_sb->raid_disks;
	conf->failed_raid_disks = conf->raid_disks - conf->active_disks;

	if (conf->failed_raid_disks != 0) {
		if (conf->failed_raid_disks == 1) {
			volume->flags |= MD_DEGRADED;

			if (conf->failed_disk_index < 0) {

				/* Find the missing disk. */
				for (i = 0; (i < conf->raid_disks) && (conf->failed_disk_index < 0); i++) {
					if (conf->disks[i].used_slot == 0) {
						conf->failed_disk_index = i;
					}
				}

				if (conf->failed_disk_index >= 0) {
					/*
					 * Find the entry for the failed disk in
					 * the super block so that we can fill
					 * in as much information as possible
					 * about the missing disk.
					 */
					for (i = 0; i < MD_SB_DISKS; i++) {
						if (volume->super_block->disks[i].raid_disk == conf->failed_disk_index) {
							conf_disk = &conf->disks[conf->failed_disk_index];
							conf_disk->number = volume->super_block->disks[i].number;
							conf_disk->raid_disk = conf->failed_disk_index;
							conf_disk->dev = NULL;

							conf_disk->operational = 0;
							conf_disk->write_only = 0;
							conf_disk->spare = 0;
							conf_disk->used_slot = 1;

							break;
						}
					}

				} else {
					/*
					 * One of the required RAID disks is not
					 * active, yet we could not find the
					 * failed disk and thus run in degrade
					 * mode.  Mark the volume corrupt.
					 */
					volume->flags &= ~MD_DEGRADED;
					volume->flags |= MD_CORRUPT;
				}
			}

			if (volume->flags & MD_DEGRADED) {
				EngFncs->user_message(MY_PLUGIN, NULL, NULL,
						      "RAID%d array %s is missing the member %s with RAID index %d.  "
						      "The array is running in degrade mode.\n",
						      conf->level, volume->name,
						      (conf->disks[conf->failed_disk_index].dev != NULL) ? conf->disks[conf->failed_disk_index].dev->name : "",
						      conf->failed_disk_index);
			}

		} else {
			/* Too many failed disks in the RAID. */
			volume->flags |= MD_CORRUPT;
		}
	}

	RETURN(0);
}


/*
 * Function: raid5_create_region
 *
 * Build the engine region for a raid5 MD volume and put it on 'output_list'.
 *
 * volume      - the MD volume to turn into a region.
 * output_list - list that receives the new region.
 * final_call  - when FALSE, a volume whose member count does not match the
 *               count recorded in its first super block (or that has no first
 *               super block) is left alone so that a later discovery pass can
 *               complete it; when TRUE the region is created regardless.
 *
 * If the volume's name cannot be registered, names of the form "md/md<N>" are
 * tried from the highest minor downwards, and the minor that finally worked
 * is written into the super block of every present child.
 *
 * Returns 0 on success or when discovery is delayed, ENOMEM when no region
 * name could be allocated, or the error from md_allocate_memory() /
 * create_raid5_conf().  When the configuration memory cannot be allocated,
 * the region is still created and listed, but is flagged corrupt.
 */
int raid5_create_region(md_volume_t * volume, dlist_t output_list, boolean final_call) {
	int rc = 0;
	storage_object_t * region;
	int found = 0;
	int i, j = -1;	/* j >= 0 only if a fallback minor had to be picked */

	LOG_ENTRY;

	if ((!volume->super_array[0] || (volume->nr_disks !=  volume->super_array[0]->nr_disks)) &&
	    !final_call) {
		LOG_DETAILS("Region %s is missing members, delaying discovery.\n", volume->name);
		RETURN(0);
	}

	LOG_DETAILS("Discovered region %s.\n", volume->name);
	if ((rc = EngFncs->allocate_region(volume->name, &region))) {
		/*
		 * The name is taken.  Walk the minors downwards until one
		 * can be registered.  Leave the loop as soon as that happens
		 * so that j still holds the minor that matches the name.
		 */
		for (j = MAX_MD_MINORS - 1; j >= 0; j--) {
			sprintf(volume->name, "md/md%d", j);
			rc = EngFncs->allocate_region(volume->name, &region);
			if (rc == 0) {
				break;
			}
		}
		if (rc != 0) {
			LOG_ERROR("No more names for MD.\n");
			RETURN(ENOMEM);
		}
	}
	region->size = 0;
	for (i = 0; (i < MAX_MD_DEVICES) && (found < volume->nr_disks); i++ ) {
		/* Check for null object, if missing, skip. */
		if (volume->child_object[i]) {
			/*
			 * If name registration failed and we changed the name,
			 * fix up all the minor numbers.
			 */
			if (j >= 0) {
				volume->super_array[i]->md_minor = j;
			}
			md_append_region_to_object(region, volume->child_object[i]);
			LOG_DETAILS("Adding object %s to volume %s.\n", volume->child_object[i]->name, volume->name);
			found++;
		}
	}

	/* One member's worth of space holds parity; the rest is data. */
	region->size = (volume->super_block->raid_disks - 1) * (volume->super_block->size * (BLOCK_SIZE >> EVMS_VSECTOR_SIZE_SHIFT));
	region->data_type = DATA_TYPE;
	region->plugin = raid5_plugin;
	region->private_data = (void *)volume;

	volume->flags |= MD_DISCOVERED;
	volume->region = region;

	/* Validate only (with messages); fix only if the user agrees. */
	if (raid5_verify_and_fix_array(volume, 0, 1)) {
		int    answer = 0;
		char * choice_text[3] = { "Don't Fix", "Fix", NULL};
		EngFncs->user_message(my_plugin, &answer,choice_text,
				      "MD region %s has inconsistent metadata.  "
				      "If you elect not to fix the region at this time, "
				      "you may do so later by using \"Modify properties\" for the region.  "
				      "Changes will not be written to disk until you select to commit the changes.\n",
				      volume->name);
		if (answer) {
			raid5_verify_and_fix_array(volume, 1, 0);
		}
	}

	rc = md_allocate_memory(&volume->private_data, sizeof (raid5_conf_t));
	if (rc == 0) {
		if (!(volume->flags & MD_CORRUPT)) {
			rc = create_raid5_conf(volume);
		}

		/* create_raid5_conf() may itself have marked the volume corrupt. */
		if (volume->flags & MD_CORRUPT) {
			region->flags |= SOFLAG_CORRUPT;
		}

	} else {
		LOG_CRITICAL("Error %d allocating memory for raid5 configuration structure.\n", rc);
		volume->flags |= MD_CORRUPT;
		region->flags |= SOFLAG_CORRUPT;
	}

	if ((volume->flags & MD_DIRTY) && !(volume->flags & MD_CORRUPT)) {
		/* Mark region dirty. */
		region->flags |= SOFLAG_DIRTY;
		EngFncs->set_changes_pending();
	}

	md_add_object_to_list(region, output_list);
	RETURN(rc);
}




/*
 * Function: raid5_discover_regions
 *
 *  Walk the global volume list and try to create a region for every RAID5
 *  volume that has not been discovered yet.  '*count' is incremented once
 *  for each volume that is flagged MD_DISCOVERED after the attempt.
 *
 *  Returns the result of the last raid5_create_region() call, or 0 if no
 *  volume needed processing.
 */
int raid5_discover_regions( dlist_t output_list, int *count, boolean final_call ) {
	int status = 0;
	md_volume_t * vol = volume_list_head;

	LOG_ENTRY;

	for (; vol != NULL; vol = vol->next) {
		/* Skip volumes already handled or owned by another personality. */
		if ((vol->flags & MD_DISCOVERED) || (vol->personality != RAID5)) {
			continue;
		}

		status = raid5_create_region(vol, output_list, final_call);
		if (vol->flags & MD_DISCOVERED) {
			(*count)++;
		}
	}

	RETURN(status);
}


/*
 * Verify the raid 4/5 array.  If 'fix' is 0 then just perform validation,
 * return 0 if array is OK, 1 if array needs to be fixed.  If 'fix' is 1, then
 * fix up the array on the fly and return 1 if array was modified, 0 if
 * untouched.
 *
 * volume - the MD volume whose master super block is checked.
 * fix    - 0: report only;  non-zero: repair the super block in memory.
 * do_msg - non-zero: emit a MESSAGE for each problem found (only used when
 *          'fix' is 0).
 *
 * The checks, in order:
 *   1. every disk slot: holes, slot number / raid index, major/minor and
 *      descriptor state;
 *   2. the unused descriptors after the last slot, which must be all zero
 *      apart from an optional sync bit;
 *   3. the disk counters in the super block.
 *
 * When something was actually fixed, the region is marked dirty, the
 * MD_CORRUPT / MD_DEGRADED flags are recomputed from the number of active
 * disks, and an existing raid5 configuration structure is rebuilt.
 */
int  raid5_verify_and_fix_array(md_volume_t * volume, int fix, int do_msg) {

	int     i, k;
	int     change = 0;
	int     nr_disks = 0, spare_disks = 0, working_disks = 0, active_disks = 0;
	int     failed_disks = 0;
	int     found_disks = 0;
	int     major, minor;
	mdp_disk_t disk;
	mdp_disk_t disk2;

	LOG_ENTRY;

	/*
	 * Keep scanning until both the number of slots the super block claims
	 * and the number of objects actually found have been covered.
	 */
	for (i = 0; (i < MAX_MD_DEVICES) && ((nr_disks < volume->super_block->nr_disks) || (found_disks < volume->nr_disks)); i++ ) {
		nr_disks++;
		if (volume->child_object[i] == NULL) {
			failed_disks ++;

			if ((volume->super_block->disks[i].major != 0) &&
			    !(volume->super_block->disks[i].state & (1 << MD_DISK_FAULTY))) {
				/*
				 * The super block says there should be a
				 * non-faulty disk in this slot.
				 */
				if (!fix) {
					if (do_msg) {
						MESSAGE("Region %s missing raid array object %d.  "
							"Possible identifier of missing object is Major=%d Minor=%d.\n",
							volume->name, i, volume->super_block->disks[i].major, volume->super_block->disks[i].minor);
					}
				}
				/*
				 * If the hole is after the RAID array entries,
				 * it can be fixed up.
				 */
				if (i >= volume->super_block->raid_disks) {
					change= TRUE;
					if (fix) {
						/* Collapse super array and object array. */
						for (k = i; k < (MAX_MD_DEVICES - 1); k++) {
							volume->super_array[k]= volume->super_array[k+1];
							volume->child_object[k] = volume->child_object[k+1];
							volume->super_block->disks[k]= volume->super_block->disks[k+1];
						}
						/*
						 * Zero out now empty disk entry.
						 * The last slot of the pointer
						 * arrays was shifted down as well,
						 * so clear it too; otherwise it
						 * would duplicate the slot before it.
						 */
						memset(&volume->super_block->disks[k],0,sizeof(mdp_disk_t));
						volume->super_array[k] = NULL;
						volume->child_object[k] = NULL;
						failed_disks--;

						/* try again */
						nr_disks--;
						i--;
					}
				}
			}

		} else {
			found_disks++;
			/* Slot number and RAID index must both equal the slot position. */
			if (volume->super_block->disks[i].number != i ||
			    volume->super_block->disks[i].raid_disk != i) {
				change= TRUE;
				if (fix) {
					volume->super_block->disks[i].number = i;
					volume->super_block->disks[i].raid_disk = i;
				} else {
					if (do_msg) {
						MESSAGE("Region %s object index %d has an internal index (%d) that is not correct.\n", volume->name, i, volume->super_block->disks[i].number);
					}
				}
			}

			/* NOTE(review): the result of get_legacy_dev() is not checked. */
			get_legacy_dev(volume, volume->child_object[i]->name, &major, &minor);
			if (volume->super_block->disks[i].major != major ||
			    volume->super_block->disks[i].minor != minor) {
				change = TRUE;
				if (fix) {
					volume->super_block->disks[i].major = major;
					volume->super_block->disks[i].minor = minor;
				} else {
					if (do_msg) {
						MESSAGE("Region %s object index %d has incorrect major/minor.\n", volume->name, i);
					}
				}
			}

			switch (volume->super_block->disks[i].state) {
			case ((1 << MD_DISK_ACTIVE) | (1 << MD_DISK_SYNC)):
				active_disks++;
				working_disks++;
				break;

				/*
				 * Active but not sync, or not active but sync.  Kernel
				 * just kind of ignores these drives so if it's inside
				 * the raid, mark it faulty, else convert it to a spare
				 * disk.
				 */
			case (1 << MD_DISK_ACTIVE):
			case (1 << MD_DISK_SYNC):
				change= TRUE;
				if (fix) {
					if (i < volume->super_block->raid_disks) {
						volume->super_block->disks[i].state = (1 << MD_DISK_FAULTY);
						failed_disks++;
					} else {
						volume->super_block->disks[i].state = 0;
						spare_disks++;
						working_disks++;
					}
				} else {
					if (do_msg) {
						MESSAGE("Region %s object index %d is not in a valid state.\n", volume->name, i);
					}
				}
				break;

			case (1 << MD_DISK_NEW):
			case 0:	/* A state of 0 means it's a spare */
				spare_disks++;
				working_disks++;
				break;

			case (1 << MD_DISK_REMOVED):
			case (1 << MD_DISK_FAULTY):
			case ((1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED)):
			default:
				if (!fix) {
					if (do_msg) {
						if (i < volume->super_block->raid_disks) {
							MESSAGE("Region %s object index %d is faulty. Array may be degraded.\n",volume->name, i);
						}
					}
				}
				failed_disks++;
				break;
			}
		}
	}

	/*
	 * Check to be sure that all of the unused disks array entries are
	 * zeroed.  If not, the boneheaded kernel MD code will use these even
	 * though the count field indicates that they are not valid.  To make
	 * matters worse, only raid4/5 and 1 work this way, so since we have a
	 * common SB creation routine we can not always be right.  So just allow
	 * these extras disks entries to have the sync bit on or off.
	 */
	/* 'disk' is the all-zero descriptor with only the sync bit set ... */
	memset(&disk, 0, sizeof(mdp_disk_t));
	disk.state = (1 << MD_DISK_SYNC);

	/* ... and 'disk2' is the completely zeroed descriptor. */
	memset(&disk2, 0, sizeof(mdp_disk_t));

	for (i = max(i, volume->super_block->raid_disks); i < MAX_MD_DEVICES; i++) {
		if (memcmp(&disk, &volume->super_block->disks[i], sizeof(mdp_disk_t)) &&
		    memcmp(&disk2, &volume->super_block->disks[i], sizeof(mdp_disk_t))) {
			change = TRUE;
			if (fix) {
				memcpy(&volume->super_block->disks[i], &disk, sizeof(mdp_disk_t));
			} else {
				if (do_msg) {
					MESSAGE("Region %s has unused disk array entries that are not zeroed.\n", volume->name);
				}
			}
		}
	}

	/* Compare the counters in the super block with what was just counted. */
	if (volume->super_block->active_disks != active_disks ||
	    volume->super_block->working_disks != working_disks ||
	    volume->super_block->failed_disks != failed_disks ||
	    volume->super_block->spare_disks != spare_disks ||
	    volume->super_block->raid_disks > nr_disks ||
	    volume->super_block->nr_disks != nr_disks ) {
		change = TRUE;
		if (fix) {
			volume->super_block->active_disks = active_disks;
			volume->super_block->working_disks = working_disks;
			volume->super_block->failed_disks = failed_disks;
			volume->super_block->spare_disks = spare_disks;
			nr_disks = max(nr_disks, volume->super_block->raid_disks);
			volume->super_block->nr_disks = nr_disks;
		} else {
			if (do_msg) {
				MESSAGE("Region %s has disk counts that are not correct.\n", volume->name);
			}
		}
	}

	if (fix && change) {
		volume->region->flags |= SOFLAG_DIRTY;
		md_rediscover_volumes_for_region(volume->region);

		/* Recompute the health flags from the repaired super block. */
		volume->flags &= ~(MD_CORRUPT | MD_DEGRADED);

		if (volume->super_block->raid_disks != active_disks) {
			if ((volume->super_block->raid_disks - active_disks) == 1) {
				volume->flags |= MD_DEGRADED;
			} else {
				volume->flags |= MD_CORRUPT;
			}
		}

		if (!(volume->flags & MD_CORRUPT)) {
			/* Reset the configuration data. */
			if (mdvol_to_conf(volume) != NULL) {
				memset(mdvol_to_conf(volume), 0, sizeof(raid5_conf_t));
				create_raid5_conf(volume);
			}
		}

		EngFncs->set_changes_pending();
	}
	RETURN(change);
}

